Updating Kokkos lib to 2.03.00
This commit is contained in:
@ -1,5 +1,28 @@
|
||||
# Change Log
|
||||
|
||||
## [2.03.00](https://github.com/kokkos/kokkos/tree/2.03.00) (2017-04-25)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.15...2.03.00)
|
||||
|
||||
**Implemented enhancements:**
|
||||
|
||||
- UnorderedMap: make it accept Devices or MemorySpaces [\#711](https://github.com/kokkos/kokkos/issues/711)
|
||||
- sort to accept DynamicView and \[begin,end\) indices [\#691](https://github.com/kokkos/kokkos/issues/691)
|
||||
- ENABLE Macros should only be used via \#ifdef or \#if defined [\#675](https://github.com/kokkos/kokkos/issues/675)
|
||||
- Remove impl/Kokkos\_Synchronic\_\* [\#666](https://github.com/kokkos/kokkos/issues/666)
|
||||
- Turning off IVDEP for Intel 14. [\#638](https://github.com/kokkos/kokkos/issues/638)
|
||||
- Using an installed Kokkos in a target application using CMake [\#633](https://github.com/kokkos/kokkos/issues/633)
|
||||
- Create Kokkos Bill of Materials [\#632](https://github.com/kokkos/kokkos/issues/632)
|
||||
- MDRangePolicy and tagged evaluators [\#547](https://github.com/kokkos/kokkos/issues/547)
|
||||
- Add PGI support [\#289](https://github.com/kokkos/kokkos/issues/289)
|
||||
|
||||
**Fixed bugs:**
|
||||
|
||||
- Output from PerTeam fails [\#733](https://github.com/kokkos/kokkos/issues/733)
|
||||
- Cuda: architecture flag not added to link line [\#688](https://github.com/kokkos/kokkos/issues/688)
|
||||
- Getting large chunks of memory for a thread team in a universal way [\#664](https://github.com/kokkos/kokkos/issues/664)
|
||||
- Kokkos RNG normal\(\) function hangs for small seed value [\#655](https://github.com/kokkos/kokkos/issues/655)
|
||||
- Kokkos Tests Errors on Shepard/HSW Builds [\#644](https://github.com/kokkos/kokkos/issues/644)
|
||||
|
||||
## [2.02.15](https://github.com/kokkos/kokkos/tree/2.02.15) (2017-02-10)
|
||||
[Full Changelog](https://github.com/kokkos/kokkos/compare/2.02.07...2.02.15)
|
||||
|
||||
|
||||
@ -98,10 +98,10 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
Kokkos_ENABLE_QTHREAD
|
||||
KOKKOS_HAVE_QTHREAD
|
||||
"Enable QTHREAD support in Kokkos."
|
||||
"${TPL_ENABLE_QTHREAD}"
|
||||
Kokkos_ENABLE_Qthreads
|
||||
KOKKOS_HAVE_QTHREADS
|
||||
"Enable Qthreads support in Kokkos."
|
||||
"${TPL_ENABLE_QTHREADS}"
|
||||
)
|
||||
|
||||
TRIBITS_ADD_OPTION_AND_DEFINE(
|
||||
@ -213,4 +213,3 @@ TRIBITS_EXCLUDE_FILES(
|
||||
)
|
||||
|
||||
TRIBITS_PACKAGE_POSTPROCESS()
|
||||
|
||||
|
||||
@ -1,39 +1,38 @@
|
||||
# Default settings common options
|
||||
# Default settings common options.
|
||||
|
||||
#LAMMPS specific settings:
|
||||
KOKKOS_PATH=../../lib/kokkos
|
||||
CXXFLAGS=$(CCFLAGS)
|
||||
|
||||
#Options: OpenMP,Serial,Pthreads,Cuda
|
||||
# Options: Cuda,OpenMP,Pthreads,Qthreads,Serial
|
||||
KOKKOS_DEVICES ?= "OpenMP"
|
||||
#KOKKOS_DEVICES ?= "Pthreads"
|
||||
#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
|
||||
# Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,ARMv80,ARMv81,ARMv8-ThunderX,BGQ,Power7,Power8,Power9,KNL,BDW,SKX
|
||||
KOKKOS_ARCH ?= ""
|
||||
#Options: yes,no
|
||||
# Options: yes,no
|
||||
KOKKOS_DEBUG ?= "no"
|
||||
#Options: hwloc,librt,experimental_memkind
|
||||
# Options: hwloc,librt,experimental_memkind
|
||||
KOKKOS_USE_TPLS ?= ""
|
||||
#Options: c++11,c++1z
|
||||
# Options: c++11,c++1z
|
||||
KOKKOS_CXX_STANDARD ?= "c++11"
|
||||
#Options: aggressive_vectorization,disable_profiling
|
||||
# Options: aggressive_vectorization,disable_profiling
|
||||
KOKKOS_OPTIONS ?= ""
|
||||
|
||||
#Default settings specific options
|
||||
#Options: force_uvm,use_ldg,rdc,enable_lambda
|
||||
# Default settings specific options.
|
||||
# Options: force_uvm,use_ldg,rdc,enable_lambda
|
||||
KOKKOS_CUDA_OPTIONS ?= "enable_lambda"
|
||||
|
||||
# Check for general settings
|
||||
|
||||
# Check for general settings.
|
||||
KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l))
|
||||
KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l))
|
||||
KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l))
|
||||
|
||||
# Check for external libraries
|
||||
# Check for external libraries.
|
||||
KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l))
|
||||
KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l))
|
||||
KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l))
|
||||
|
||||
# Check for advanced settings
|
||||
# Check for advanced settings.
|
||||
KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l))
|
||||
KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l))
|
||||
KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l))
|
||||
@ -41,21 +40,21 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | gr
|
||||
KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l))
|
||||
KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l))
|
||||
|
||||
# Check for Kokkos Host Execution Spaces one of which must be on
|
||||
|
||||
# Check for Kokkos Host Execution Spaces one of which must be on.
|
||||
KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMP | wc -l))
|
||||
KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l))
|
||||
KOKKOS_INTERNAL_USE_QTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthreads | wc -l))
|
||||
KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l))
|
||||
KOKKOS_INTERNAL_USE_QTHREAD := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthread | wc -l))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0)
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0)
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 0)
|
||||
KOKKOS_INTERNAL_USE_SERIAL := 1
|
||||
endif
|
||||
endif
|
||||
endif
|
||||
|
||||
# Check for other Execution Spaces
|
||||
|
||||
# Check for other Execution Spaces.
|
||||
KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
@ -64,15 +63,13 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_NVCC_VERSION := $(shell nvcc --version 2>&1 | grep release | cut -d' ' -f5 | cut -d',' -f1 | tr -d .)
|
||||
endif
|
||||
|
||||
# Check OS
|
||||
|
||||
# Check OS.
|
||||
KOKKOS_OS := $(shell uname -s)
|
||||
KOKKOS_INTERNAL_OS_CYGWIN := $(shell uname -s | grep CYGWIN | wc -l)
|
||||
KOKKOS_INTERNAL_OS_LINUX := $(shell uname -s | grep Linux | wc -l)
|
||||
KOKKOS_INTERNAL_OS_DARWIN := $(shell uname -s | grep Darwin | wc -l)
|
||||
|
||||
# Check compiler
|
||||
|
||||
# Check compiler.
|
||||
KOKKOS_INTERNAL_COMPILER_INTEL := $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_PGI := $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)
|
||||
KOKKOS_INTERNAL_COMPILER_XL := $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)
|
||||
@ -95,6 +92,7 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_CLANG_VERSION := $(shell clang --version | grep version | cut -d ' ' -f3 | tr -d '.')
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_CLANG_VERSION) -lt 400; echo $$?),0)
|
||||
$(error Compiling Cuda code directly with Clang requires version 4.0.0 or higher)
|
||||
@ -103,7 +101,6 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
endif
|
||||
endif
|
||||
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -mp
|
||||
else
|
||||
@ -114,7 +111,7 @@ else
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -qsmp=omp
|
||||
else
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
# OpenMP is turned on by default in Cray compiler environment
|
||||
# OpenMP is turned on by default in Cray compiler environment.
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG :=
|
||||
else
|
||||
KOKKOS_INTERNAL_OPENMP_FLAG := -fopenmp
|
||||
@ -138,9 +135,9 @@ else
|
||||
endif
|
||||
endif
|
||||
|
||||
# Check for Kokkos Architecture settings
|
||||
# Check for Kokkos Architecture settings.
|
||||
|
||||
#Intel based
|
||||
# Intel based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
|
||||
@ -148,7 +145,7 @@ KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW |
|
||||
KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
|
||||
|
||||
#NVIDIA based
|
||||
# NVIDIA based.
|
||||
NVCC_WRAPPER := $(KOKKOS_PATH)/config/nvcc_wrapper
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l))
|
||||
@ -170,9 +167,9 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0)
|
||||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
|
||||
@ -183,33 +180,33 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
|
||||
+ $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
|
||||
endif
|
||||
|
||||
#ARM based
|
||||
# ARM based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l))
|
||||
|
||||
#IBM based
|
||||
# IBM based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power9 | wc -l))
|
||||
KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc))
|
||||
|
||||
#AMD based
|
||||
# AMD based.
|
||||
KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l))
|
||||
|
||||
#Any AVX?
|
||||
# Any AVX?
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
|
||||
# Decide what ISA level we are able to support
|
||||
# Decide what ISA level we are able to support.
|
||||
KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
|
||||
KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc ))
|
||||
|
||||
#Incompatible flags?
|
||||
# Incompatible flags?
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX)>1" | bc ))
|
||||
KOKKOS_INTERNAL_USE_ARCH_MULTIGPU := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_NVIDIA)>1" | bc))
|
||||
|
||||
@ -220,7 +217,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MULTIGPU), 1)
|
||||
$(error Defined Multiple GPU architectures: KOKKOS_ARCH=$(KOKKOS_ARCH) )
|
||||
endif
|
||||
|
||||
#Generating the list of Flags
|
||||
# Generating the list of Flags.
|
||||
|
||||
KOKKOS_CPPFLAGS = -I./ -I$(KOKKOS_PATH)/core/src -I$(KOKKOS_PATH)/containers/src -I$(KOKKOS_PATH)/algorithms/src
|
||||
|
||||
@ -236,15 +233,19 @@ KOKKOS_LDFLAGS = -L$(shell pwd)
|
||||
KOKKOS_SRC =
|
||||
KOKKOS_HEADERS =
|
||||
|
||||
#Generating the KokkosCore_config.h file
|
||||
# Generating the KokkosCore_config.h file.
|
||||
|
||||
tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp)
|
||||
tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp)
|
||||
tmp := $(shell date >> KokkosCore_config.tmp)
|
||||
tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp)
|
||||
|
||||
|
||||
tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp)
|
||||
endif
|
||||
@ -253,12 +254,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREADS 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp )
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1)
|
||||
@ -279,12 +280,6 @@ ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1)
|
||||
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
|
||||
KOKKOS_CPPFLAGS += -I$(QTHREAD_PATH)/include
|
||||
KOKKOS_LDFLAGS += -L$(QTHREAD_PATH)/lib
|
||||
tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREAD 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
|
||||
tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp)
|
||||
ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1)
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG)
|
||||
@ -341,6 +336,7 @@ endif
|
||||
tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
@ -365,16 +361,19 @@ ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1)
|
||||
$(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp )
|
||||
endif
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
#Add Architecture flags
|
||||
# Add Architecture flags.
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
@ -391,6 +390,7 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
@ -408,6 +408,7 @@ endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1)
|
||||
KOKKOS_CXXFLAGS +=
|
||||
KOKKOS_LDFLAGS +=
|
||||
@ -424,6 +425,7 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -mavx
|
||||
KOKKOS_LDFLAGS += -mavx
|
||||
@ -435,7 +437,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1)
|
||||
KOKKOS_CXXFLAGS += -tp=sandybridge
|
||||
KOKKOS_LDFLAGS += -tp=sandybridge
|
||||
else
|
||||
# Assume that this is a really a GNU compiler
|
||||
# Assume that this is a really a GNU compiler.
|
||||
KOKKOS_CXXFLAGS += -mavx
|
||||
KOKKOS_LDFLAGS += -mavx
|
||||
endif
|
||||
@ -445,10 +447,11 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Assume that this is a really a GNU compiler or it could be XL on P8
|
||||
# Assume that this is a really a GNU compiler or it could be XL on P8.
|
||||
KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
|
||||
KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
|
||||
endif
|
||||
@ -456,10 +459,11 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Assume that this is a really a GNU compiler or it could be XL on P9
|
||||
# Assume that this is a really a GNU compiler or it could be XL on P9.
|
||||
KOKKOS_CXXFLAGS += -mcpu=power9 -mtune=power9
|
||||
KOKKOS_LDFLAGS += -mcpu=power9 -mtune=power9
|
||||
endif
|
||||
@ -467,6 +471,7 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xCORE-AVX2
|
||||
KOKKOS_LDFLAGS += -xCORE-AVX2
|
||||
@ -478,7 +483,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
|
||||
KOKKOS_CXXFLAGS += -tp=haswell
|
||||
KOKKOS_LDFLAGS += -tp=haswell
|
||||
else
|
||||
# Assume that this is a really a GNU compiler
|
||||
# Assume that this is a really a GNU compiler.
|
||||
KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
|
||||
KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
|
||||
endif
|
||||
@ -488,6 +493,7 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xMIC-AVX512
|
||||
KOKKOS_LDFLAGS += -xMIC-AVX512
|
||||
@ -498,7 +504,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Asssume that this is really a GNU compiler
|
||||
# Asssume that this is really a GNU compiler.
|
||||
KOKKOS_CXXFLAGS += -march=knl
|
||||
KOKKOS_LDFLAGS += -march=knl
|
||||
endif
|
||||
@ -508,6 +514,7 @@ endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp )
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1)
|
||||
KOKKOS_CXXFLAGS += -xCORE-AVX512
|
||||
KOKKOS_LDFLAGS += -xCORE-AVX512
|
||||
@ -518,7 +525,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1)
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1)
|
||||
|
||||
else
|
||||
# Nothing here yet
|
||||
# Nothing here yet.
|
||||
KOKKOS_CXXFLAGS += -march=skylake-avx512
|
||||
KOKKOS_LDFLAGS += -march=skylake-avx512
|
||||
endif
|
||||
@ -532,67 +539,79 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1)
|
||||
KOKKOS_LDFLAGS += -mmic
|
||||
endif
|
||||
|
||||
#Figure out the architecture flag for Cuda
|
||||
# Figure out the architecture flag for Cuda.
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-arch
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=-x cuda --cuda-gpu-arch
|
||||
KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG=--cuda-gpu-arch
|
||||
KOKKOS_CXXFLAGS += -x cuda
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_30
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_32
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_35
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_37
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_50
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_52
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_53
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_61
|
||||
endif
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1)
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
|
||||
tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp )
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_COMPILER_CUDA_ARCH_FLAG)=sm_60
|
||||
endif
|
||||
|
||||
endif
|
||||
|
||||
KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
|
||||
ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h)
|
||||
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
|
||||
KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l))
|
||||
else
|
||||
KOKKOS_INTERNAL_NEW_CONFIG := 1
|
||||
KOKKOS_INTERNAL_NEW_CONFIG := 1
|
||||
endif
|
||||
|
||||
ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0)
|
||||
@ -616,30 +635,34 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
|
||||
KOKKOS_LIBS += -lcudart -lcuda
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
KOKKOS_LIBS += -lpthread
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
|
||||
KOKKOS_LIBS += -lqthread
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp)
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1)
|
||||
KOKKOS_CXXFLAGS += -Xcompiler $(KOKKOS_INTERNAL_OPENMP_FLAG)
|
||||
else
|
||||
KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
|
||||
endif
|
||||
|
||||
KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENMP_FLAG)
|
||||
endif
|
||||
|
||||
#Explicitly set the GCC Toolchain for Clang
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp)
|
||||
KOKKOS_LIBS += -lpthread
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
|
||||
KOKKOS_SRC += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.cpp)
|
||||
KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
|
||||
KOKKOS_CPPFLAGS += -I$(QTHREADS_PATH)/include
|
||||
KOKKOS_LDFLAGS += -L$(QTHREADS_PATH)/lib
|
||||
KOKKOS_LIBS += -lqthread
|
||||
endif
|
||||
|
||||
# Explicitly set the GCC Toolchain for Clang.
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_INTERNAL_GCC_PATH = $(shell which g++)
|
||||
KOKKOS_INTERNAL_GCC_TOOLCHAIN = $(KOKKOS_INTERNAL_GCC_PATH:/bin/g++=)
|
||||
@ -647,15 +670,15 @@ ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1)
|
||||
KOKKOS_LDFLAGS += --gcc-toolchain=$(KOKKOS_INTERNAL_GCC_TOOLCHAIN)
|
||||
endif
|
||||
|
||||
#With Cygwin functions such as fdopen and fileno are not defined
|
||||
#when strict ansi is enabled. strict ansi gets enabled with --std=c++11
|
||||
#though. So we hard undefine it here. Not sure if that has any bad side effects
|
||||
#This is needed for gtest actually, not for Kokkos itself!
|
||||
# With Cygwin functions such as fdopen and fileno are not defined
|
||||
# when strict ansi is enabled. strict ansi gets enabled with --std=c++11
|
||||
# though. So we hard undefine it here. Not sure if that has any bad side effects
|
||||
# This is needed for gtest actually, not for Kokkos itself!
|
||||
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1)
|
||||
KOKKOS_CXXFLAGS += -U__STRICT_ANSI__
|
||||
endif
|
||||
|
||||
# Setting up dependencies
|
||||
# Setting up dependencies.
|
||||
|
||||
KokkosCore_config.h:
|
||||
|
||||
|
||||
@ -18,6 +18,8 @@ Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
|
||||
Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
|
||||
Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp
|
||||
Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
|
||||
Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp
|
||||
@ -43,11 +45,11 @@ Kokkos_ThreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Threads/Kokk
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Threads/Kokkos_ThreadsExec.cpp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
|
||||
Kokkos_QthreadExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_QthreadExec.cpp
|
||||
Kokkos_Qthread_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthread/Kokkos_Qthread_TaskPolicy.cpp
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
|
||||
Kokkos_QthreadsExec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_QthreadsExec.cpp
|
||||
Kokkos_Qthreads_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
@ -59,4 +61,3 @@ endif
|
||||
|
||||
Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp
|
||||
|
||||
|
||||
@ -45,31 +45,39 @@ Primary tested compilers on X86 are:
|
||||
GCC 4.8.4
|
||||
GCC 4.9.2
|
||||
GCC 5.1.0
|
||||
GCC 5.2.0
|
||||
Intel 14.0.4
|
||||
Intel 15.0.2
|
||||
Intel 16.0.1
|
||||
Intel 17.0.098
|
||||
Intel 17.1.132
|
||||
Clang 3.5.2
|
||||
Clang 3.6.1
|
||||
Clang 3.7.1
|
||||
Clang 3.8.1
|
||||
Clang 3.9.0
|
||||
PGI 17.1
|
||||
|
||||
Primary tested compilers on Power 8 are:
|
||||
GCC 5.4.0 (OpenMP,Serial)
|
||||
IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug)
|
||||
|
||||
Primary tested compilers on Intel KNL are:
|
||||
GCC 6.2.0
|
||||
Intel 16.2.181 (with gcc 4.7.2)
|
||||
Intel 17.0.098 (with gcc 4.7.2)
|
||||
Intel 17.1.132 (with gcc 4.9.3)
|
||||
Intel 17.2.174 (with gcc 4.9.3)
|
||||
Intel 18.0.061 (beta) (with gcc 4.9.3)
|
||||
|
||||
Secondary tested compilers are:
|
||||
CUDA 7.0 (with gcc 4.7.2)
|
||||
CUDA 7.5 (with gcc 4.7.2)
|
||||
CUDA 7.0 (with gcc 4.8.4)
|
||||
CUDA 7.5 (with gcc 4.8.4)
|
||||
CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8)
|
||||
CUDA/Clang 8.0 using Clang/Trunk compiler
|
||||
|
||||
Other compilers working:
|
||||
X86:
|
||||
PGI 15.4
|
||||
Cygwin 2.1.0 64bit with gcc 4.9.3
|
||||
|
||||
Known non-working combinations:
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
LIB_REQUIRED_PACKAGES KokkosCore
|
||||
LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC
|
||||
TEST_OPTIONAL_TPLS CUSPARSE
|
||||
)
|
||||
|
||||
@ -547,7 +547,7 @@ namespace Kokkos {
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Random_XorShift64 (uint64_t state, int state_idx = 0)
|
||||
: state_(state),state_idx_(state_idx){}
|
||||
: state_(state==0?uint64_t(1318319):state),state_idx_(state_idx){}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
uint32_t urand() {
|
||||
@ -719,6 +719,9 @@ namespace Kokkos {
|
||||
}
|
||||
|
||||
void init(uint64_t seed, int num_states) {
|
||||
if(seed==0)
|
||||
seed = uint64_t(1318319);
|
||||
|
||||
num_states_ = num_states;
|
||||
|
||||
locks_ = lock_type("Kokkos::Random_XorShift64::locks",num_states_);
|
||||
@ -968,8 +971,9 @@ namespace Kokkos {
|
||||
|
||||
inline
|
||||
void init(uint64_t seed, int num_states) {
|
||||
if(seed==0)
|
||||
seed = uint64_t(1318319);
|
||||
num_states_ = num_states;
|
||||
|
||||
locks_ = int_view_type("Kokkos::Random_XorShift1024::locks",num_states_);
|
||||
state_ = state_data_type("Kokkos::Random_XorShift1024::state",num_states_);
|
||||
p_ = int_view_type("Kokkos::Random_XorShift1024::p",num_states_);
|
||||
|
||||
@ -53,69 +53,122 @@ namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<class ValuesViewType, int Rank=ValuesViewType::Rank>
|
||||
template< class DstViewType , class SrcViewType
|
||||
, int Rank = DstViewType::Rank >
|
||||
struct CopyOp;
|
||||
|
||||
template<class ValuesViewType>
|
||||
struct CopyOp<ValuesViewType,1> {
|
||||
template<class DstType, class SrcType>
|
||||
template< class DstViewType , class SrcViewType >
|
||||
struct CopyOp<DstViewType,SrcViewType,1> {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void copy(DstType& dst, size_t i_dst,
|
||||
SrcType& src, size_t i_src ) {
|
||||
static void copy(DstViewType const& dst, size_t i_dst,
|
||||
SrcViewType const& src, size_t i_src ) {
|
||||
dst(i_dst) = src(i_src);
|
||||
}
|
||||
};
|
||||
|
||||
template<class ValuesViewType>
|
||||
struct CopyOp<ValuesViewType,2> {
|
||||
template<class DstType, class SrcType>
|
||||
template< class DstViewType , class SrcViewType >
|
||||
struct CopyOp<DstViewType,SrcViewType,2> {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void copy(DstType& dst, size_t i_dst,
|
||||
SrcType& src, size_t i_src ) {
|
||||
for(int j = 0;j< (int) dst.dimension_1(); j++)
|
||||
static void copy(DstViewType const& dst, size_t i_dst,
|
||||
SrcViewType const& src, size_t i_src ) {
|
||||
for(int j = 0;j< (int) dst.extent(1); j++)
|
||||
dst(i_dst,j) = src(i_src,j);
|
||||
}
|
||||
};
|
||||
|
||||
template<class ValuesViewType>
|
||||
struct CopyOp<ValuesViewType,3> {
|
||||
template<class DstType, class SrcType>
|
||||
template< class DstViewType , class SrcViewType >
|
||||
struct CopyOp<DstViewType,SrcViewType,3> {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void copy(DstType& dst, size_t i_dst,
|
||||
SrcType& src, size_t i_src ) {
|
||||
for(int j = 0; j<dst.dimension_1(); j++)
|
||||
for(int k = 0; k<dst.dimension_2(); k++)
|
||||
static void copy(DstViewType const& dst, size_t i_dst,
|
||||
SrcViewType const& src, size_t i_src ) {
|
||||
for(int j = 0; j<dst.extent(1); j++)
|
||||
for(int k = 0; k<dst.extent(2); k++)
|
||||
dst(i_dst,j,k) = src(i_src,j,k);
|
||||
}
|
||||
};
|
||||
}
|
||||
|
||||
template<class KeyViewType, class BinSortOp, class ExecutionSpace = typename KeyViewType::execution_space,
|
||||
class SizeType = typename KeyViewType::memory_space::size_type>
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class KeyViewType
|
||||
, class BinSortOp
|
||||
, class Space = typename KeyViewType::device_type
|
||||
, class SizeType = typename KeyViewType::memory_space::size_type
|
||||
>
|
||||
class BinSort {
|
||||
|
||||
|
||||
public:
|
||||
template<class ValuesViewType, class PermuteViewType, class CopyOp>
|
||||
struct bin_sort_sort_functor {
|
||||
typedef ExecutionSpace execution_space;
|
||||
typedef typename ValuesViewType::non_const_type values_view_type;
|
||||
typedef typename ValuesViewType::const_type const_values_view_type;
|
||||
Kokkos::View<typename values_view_type::const_data_type,typename values_view_type::array_layout,
|
||||
typename values_view_type::memory_space,Kokkos::MemoryTraits<Kokkos::RandomAccess> > values;
|
||||
values_view_type sorted_values;
|
||||
typename PermuteViewType::const_type sort_order;
|
||||
bin_sort_sort_functor(const_values_view_type values_, values_view_type sorted_values_, PermuteViewType sort_order_):
|
||||
values(values_),sorted_values(sorted_values_),sort_order(sort_order_) {}
|
||||
|
||||
template< class DstViewType , class SrcViewType >
|
||||
struct copy_functor {
|
||||
|
||||
typedef typename SrcViewType::const_type src_view_type ;
|
||||
|
||||
typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
|
||||
|
||||
DstViewType dst_values ;
|
||||
src_view_type src_values ;
|
||||
int dst_offset ;
|
||||
|
||||
copy_functor( DstViewType const & dst_values_
|
||||
, int const & dst_offset_
|
||||
, SrcViewType const & src_values_
|
||||
)
|
||||
: dst_values( dst_values_ )
|
||||
, src_values( src_values_ )
|
||||
, dst_offset( dst_offset_ )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
//printf("Sort: %i %i\n",i,sort_order(i));
|
||||
CopyOp::copy(sorted_values,i,values,sort_order(i));
|
||||
// printf("copy: dst(%i) src(%i)\n",i+dst_offset,i);
|
||||
copy_op::copy(dst_values,i+dst_offset,src_values,i);
|
||||
}
|
||||
};
|
||||
|
||||
typedef ExecutionSpace execution_space;
|
||||
template< class DstViewType
|
||||
, class PermuteViewType
|
||||
, class SrcViewType
|
||||
>
|
||||
struct copy_permute_functor {
|
||||
|
||||
// If a Kokkos::View then can generate constant random access
|
||||
// otherwise can only use the constant type.
|
||||
|
||||
typedef typename std::conditional
|
||||
< Kokkos::is_view< SrcViewType >::value
|
||||
, Kokkos::View< typename SrcViewType::const_data_type
|
||||
, typename SrcViewType::array_layout
|
||||
, typename SrcViewType::device_type
|
||||
, Kokkos::MemoryTraits<Kokkos::RandomAccess>
|
||||
>
|
||||
, typename SrcViewType::const_type
|
||||
>::type src_view_type ;
|
||||
|
||||
typedef typename PermuteViewType::const_type perm_view_type ;
|
||||
|
||||
typedef Impl::CopyOp< DstViewType , src_view_type > copy_op ;
|
||||
|
||||
DstViewType dst_values ;
|
||||
perm_view_type sort_order ;
|
||||
src_view_type src_values ;
|
||||
|
||||
copy_permute_functor( DstViewType const & dst_values_
|
||||
, PermuteViewType const & sort_order_
|
||||
, SrcViewType const & src_values_
|
||||
)
|
||||
: dst_values( dst_values_ )
|
||||
, sort_order( sort_order_ )
|
||||
, src_values( src_values_ )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const int& i) const {
|
||||
// printf("copy_permute: dst(%i) src(%i)\n",i,sort_order(i));
|
||||
copy_op::copy(dst_values,i,src_values,sort_order(i));
|
||||
}
|
||||
};
|
||||
|
||||
typedef typename Space::execution_space execution_space;
|
||||
typedef BinSortOp bin_op_type;
|
||||
|
||||
struct bin_count_tag {};
|
||||
@ -124,84 +177,137 @@ public:
|
||||
struct bin_sort_bins_tag {};
|
||||
|
||||
public:
|
||||
|
||||
typedef SizeType size_type;
|
||||
typedef size_type value_type;
|
||||
|
||||
typedef Kokkos::View<size_type*, execution_space> offset_type;
|
||||
typedef Kokkos::View<const int*, execution_space> bin_count_type;
|
||||
typedef Kokkos::View<size_type*, Space> offset_type;
|
||||
typedef Kokkos::View<const int*, Space> bin_count_type;
|
||||
|
||||
typedef typename KeyViewType::const_type const_key_view_type ;
|
||||
|
||||
typedef Kokkos::View<typename KeyViewType::const_data_type,
|
||||
// If a Kokkos::View then can generate constant random access
|
||||
// otherwise can only use the constant type.
|
||||
|
||||
typedef typename std::conditional
|
||||
< Kokkos::is_view< KeyViewType >::value
|
||||
, Kokkos::View< typename KeyViewType::const_data_type,
|
||||
typename KeyViewType::array_layout,
|
||||
typename KeyViewType::memory_space> const_key_view_type;
|
||||
typedef Kokkos::View<typename KeyViewType::const_data_type,
|
||||
typename KeyViewType::array_layout,
|
||||
typename KeyViewType::memory_space,
|
||||
Kokkos::MemoryTraits<Kokkos::RandomAccess> > const_rnd_key_view_type;
|
||||
typename KeyViewType::device_type,
|
||||
Kokkos::MemoryTraits<Kokkos::RandomAccess> >
|
||||
, const_key_view_type
|
||||
>::type const_rnd_key_view_type;
|
||||
|
||||
typedef typename KeyViewType::non_const_value_type non_const_key_scalar;
|
||||
typedef typename KeyViewType::const_value_type const_key_scalar;
|
||||
|
||||
typedef Kokkos::View<int*, Space, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic_type ;
|
||||
|
||||
private:
|
||||
|
||||
const_key_view_type keys;
|
||||
const_rnd_key_view_type keys_rnd;
|
||||
|
||||
public:
|
||||
BinSortOp bin_op;
|
||||
|
||||
offset_type bin_offsets;
|
||||
BinSortOp bin_op ;
|
||||
offset_type bin_offsets ;
|
||||
bin_count_atomic_type bin_count_atomic ;
|
||||
bin_count_type bin_count_const ;
|
||||
offset_type sort_order ;
|
||||
|
||||
Kokkos::View<int*, ExecutionSpace, Kokkos::MemoryTraits<Kokkos::Atomic> > bin_count_atomic;
|
||||
bin_count_type bin_count_const;
|
||||
|
||||
offset_type sort_order;
|
||||
|
||||
bool sort_within_bins;
|
||||
int range_begin ;
|
||||
int range_end ;
|
||||
bool sort_within_bins ;
|
||||
|
||||
public:
|
||||
|
||||
// Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
|
||||
BinSort(const_key_view_type keys_, BinSortOp bin_op_,
|
||||
bool sort_within_bins_ = false)
|
||||
:keys(keys_),keys_rnd(keys_), bin_op(bin_op_) {
|
||||
BinSort() {}
|
||||
|
||||
bin_count_atomic = Kokkos::View<int*, ExecutionSpace >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
|
||||
//----------------------------------------
|
||||
// Constructor: takes the keys, the binning_operator and optionally whether to sort within bins (default false)
|
||||
BinSort( const_key_view_type keys_
|
||||
, int range_begin_
|
||||
, int range_end_
|
||||
, BinSortOp bin_op_
|
||||
, bool sort_within_bins_ = false
|
||||
)
|
||||
: keys(keys_)
|
||||
, keys_rnd(keys_)
|
||||
, bin_op(bin_op_)
|
||||
, bin_offsets()
|
||||
, bin_count_atomic()
|
||||
, bin_count_const()
|
||||
, sort_order()
|
||||
, range_begin( range_begin_ )
|
||||
, range_end( range_end_ )
|
||||
, sort_within_bins( sort_within_bins_ )
|
||||
{
|
||||
bin_count_atomic = Kokkos::View<int*, Space >("Kokkos::SortImpl::BinSortFunctor::bin_count",bin_op.max_bins());
|
||||
bin_count_const = bin_count_atomic;
|
||||
bin_offsets = offset_type("Kokkos::SortImpl::BinSortFunctor::bin_offsets",bin_op.max_bins());
|
||||
sort_order = offset_type("PermutationVector",keys.dimension_0());
|
||||
sort_within_bins = sort_within_bins_;
|
||||
sort_order = offset_type("PermutationVector",range_end-range_begin);
|
||||
}
|
||||
|
||||
BinSort( const_key_view_type keys_
|
||||
, BinSortOp bin_op_
|
||||
, bool sort_within_bins_ = false
|
||||
)
|
||||
: BinSort( keys_ , 0 , keys_.extent(0), bin_op_ , sort_within_bins_ ) {}
|
||||
|
||||
//----------------------------------------
|
||||
// Create the permutation vector, the bin_offset array and the bin_count array. Can be called again if keys changed
|
||||
void create_permute_vector() {
|
||||
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_count_tag> (0,keys.dimension_0()),*this);
|
||||
Kokkos::parallel_scan(Kokkos::RangePolicy<ExecutionSpace,bin_offset_tag> (0,bin_op.max_bins()) ,*this);
|
||||
const size_t len = range_end - range_begin ;
|
||||
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_count_tag> (0,len),*this);
|
||||
Kokkos::parallel_scan(Kokkos::RangePolicy<execution_space,bin_offset_tag> (0,bin_op.max_bins()) ,*this);
|
||||
|
||||
Kokkos::deep_copy(bin_count_atomic,0);
|
||||
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_binning_tag> (0,keys.dimension_0()),*this);
|
||||
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_binning_tag> (0,len),*this);
|
||||
|
||||
if(sort_within_bins)
|
||||
Kokkos::parallel_for (Kokkos::RangePolicy<ExecutionSpace,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
|
||||
Kokkos::parallel_for (Kokkos::RangePolicy<execution_space,bin_sort_bins_tag>(0,bin_op.max_bins()) ,*this);
|
||||
}
|
||||
|
||||
// Sort a view with respect ot the first dimension using the permutation array
|
||||
template<class ValuesViewType>
|
||||
void sort(ValuesViewType values) {
|
||||
ValuesViewType sorted_values = ValuesViewType("Copy",
|
||||
values.dimension_0(),
|
||||
values.dimension_1(),
|
||||
values.dimension_2(),
|
||||
values.dimension_3(),
|
||||
values.dimension_4(),
|
||||
values.dimension_5(),
|
||||
values.dimension_6(),
|
||||
values.dimension_7());
|
||||
void sort( ValuesViewType const & values)
|
||||
{
|
||||
typedef
|
||||
Kokkos::View< typename ValuesViewType::data_type,
|
||||
typename ValuesViewType::array_layout,
|
||||
typename ValuesViewType::device_type >
|
||||
scratch_view_type ;
|
||||
|
||||
parallel_for(values.dimension_0(),
|
||||
bin_sort_sort_functor<ValuesViewType, offset_type,
|
||||
Impl::CopyOp<ValuesViewType> >(values,sorted_values,sort_order));
|
||||
const size_t len = range_end - range_begin ;
|
||||
|
||||
deep_copy(values,sorted_values);
|
||||
scratch_view_type
|
||||
sorted_values("Scratch",
|
||||
len,
|
||||
values.extent(1),
|
||||
values.extent(2),
|
||||
values.extent(3),
|
||||
values.extent(4),
|
||||
values.extent(5),
|
||||
values.extent(6),
|
||||
values.extent(7));
|
||||
|
||||
{
|
||||
copy_permute_functor< scratch_view_type /* DstViewType */
|
||||
, offset_type /* PermuteViewType */
|
||||
, ValuesViewType /* SrcViewType */
|
||||
>
|
||||
functor( sorted_values , sort_order , values );
|
||||
|
||||
parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
|
||||
}
|
||||
|
||||
{
|
||||
copy_functor< ValuesViewType , scratch_view_type >
|
||||
functor( values , range_begin , sorted_values );
|
||||
|
||||
parallel_for( Kokkos::RangePolicy<execution_space>(0,len),functor);
|
||||
}
|
||||
}
|
||||
|
||||
// Get the permutation vector
|
||||
@ -217,9 +323,11 @@ public:
|
||||
bin_count_type get_bin_count() const {return bin_count_const;}
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const bin_count_tag& tag, const int& i) const {
|
||||
bin_count_atomic(bin_op.bin(keys,i))++;
|
||||
const int j = range_begin + i ;
|
||||
bin_count_atomic(bin_op.bin(keys,j))++;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -232,10 +340,11 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator() (const bin_binning_tag& tag, const int& i) const {
|
||||
const int bin = bin_op.bin(keys,i);
|
||||
const int j = range_begin + i ;
|
||||
const int bin = bin_op.bin(keys,j);
|
||||
const int count = bin_count_atomic(bin)++;
|
||||
|
||||
sort_order(bin_offsets(bin) + count) = i;
|
||||
sort_order(bin_offsets(bin) + count) = j ;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
@ -262,13 +371,19 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<class KeyViewType>
|
||||
struct BinOp1D {
|
||||
const int max_bins_;
|
||||
const double mul_;
|
||||
int max_bins_;
|
||||
double mul_;
|
||||
typename KeyViewType::const_value_type range_;
|
||||
typename KeyViewType::const_value_type min_;
|
||||
|
||||
BinOp1D():max_bins_(0),mul_(0.0),
|
||||
range_(typename KeyViewType::const_value_type()),
|
||||
min_(typename KeyViewType::const_value_type()) {}
|
||||
|
||||
//Construct BinOp with number of bins, minimum value and maxuimum value
|
||||
BinOp1D(int max_bins__, typename KeyViewType::const_value_type min,
|
||||
typename KeyViewType::const_value_type max )
|
||||
@ -302,12 +417,14 @@ struct BinOp3D {
|
||||
typename KeyViewType::non_const_value_type range_[3];
|
||||
typename KeyViewType::non_const_value_type min_[3];
|
||||
|
||||
BinOp3D() {}
|
||||
|
||||
BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[],
|
||||
typename KeyViewType::const_value_type max[] )
|
||||
{
|
||||
max_bins_[0] = max_bins__[0]+1;
|
||||
max_bins_[1] = max_bins__[1]+1;
|
||||
max_bins_[2] = max_bins__[2]+1;
|
||||
max_bins_[0] = max_bins__[0];
|
||||
max_bins_[1] = max_bins__[1];
|
||||
max_bins_[2] = max_bins__[2];
|
||||
mul_[0] = 1.0*max_bins__[0]/(max[0]-min[0]);
|
||||
mul_[1] = 1.0*max_bins__[1]/(max[1]-min[1]);
|
||||
mul_[2] = 1.0*max_bins__[2]/(max[2]-min[2]);
|
||||
@ -364,7 +481,7 @@ bool try_std_sort(ViewType view) {
|
||||
possible = possible && (ViewType::Rank == 1);
|
||||
possible = possible && (stride[0] == 1);
|
||||
if(possible) {
|
||||
std::sort(view.ptr_on_device(),view.ptr_on_device()+view.dimension_0());
|
||||
std::sort(view.data(),view.data()+view.extent(0));
|
||||
}
|
||||
return possible;
|
||||
}
|
||||
@ -386,7 +503,8 @@ struct min_max_functor {
|
||||
}
|
||||
|
||||
template<class ViewType>
|
||||
void sort(ViewType view, bool always_use_kokkos_sort = false) {
|
||||
void sort( ViewType const & view , bool const always_use_kokkos_sort = false)
|
||||
{
|
||||
if(!always_use_kokkos_sort) {
|
||||
if(Impl::try_std_sort(view)) return;
|
||||
}
|
||||
@ -394,14 +512,37 @@ void sort(ViewType view, bool always_use_kokkos_sort = false) {
|
||||
|
||||
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
|
||||
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
|
||||
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.dimension_0()),
|
||||
parallel_reduce(Kokkos::RangePolicy<typename ViewType::execution_space>(0,view.extent(0)),
|
||||
Impl::min_max_functor<ViewType>(view),reducer);
|
||||
if(result.min_val == result.max_val) return;
|
||||
BinSort<ViewType, CompType> bin_sort(view,CompType(view.dimension_0()/2,result.min_val,result.max_val),true);
|
||||
BinSort<ViewType, CompType> bin_sort(view,CompType(view.extent(0)/2,result.min_val,result.max_val),true);
|
||||
bin_sort.create_permute_vector();
|
||||
bin_sort.sort(view);
|
||||
}
|
||||
|
||||
template<class ViewType>
|
||||
void sort( ViewType view
|
||||
, size_t const begin
|
||||
, size_t const end
|
||||
)
|
||||
{
|
||||
typedef Kokkos::RangePolicy<typename ViewType::execution_space> range_policy ;
|
||||
typedef BinOp1D<ViewType> CompType;
|
||||
|
||||
Kokkos::Experimental::MinMaxScalar<typename ViewType::non_const_value_type> result;
|
||||
Kokkos::Experimental::MinMax<typename ViewType::non_const_value_type> reducer(result);
|
||||
|
||||
parallel_reduce( range_policy( begin , end )
|
||||
, Impl::min_max_functor<ViewType>(view),reducer );
|
||||
|
||||
if(result.min_val == result.max_val) return;
|
||||
|
||||
BinSort<ViewType, CompType>
|
||||
bin_sort(view,begin,end,CompType((end-begin)/2,result.min_val,result.max_val),true);
|
||||
|
||||
bin_sort.create_permute_vector();
|
||||
bin_sort.sort(view);
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
@ -44,6 +44,7 @@
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
#include<Kokkos_Core.hpp>
|
||||
#include<Kokkos_DynamicView.hpp>
|
||||
#include<Kokkos_Random.hpp>
|
||||
#include<Kokkos_Sort.hpp>
|
||||
|
||||
@ -192,17 +193,81 @@ void test_3D_sort(unsigned int n) {
|
||||
double epsilon = 1e-10;
|
||||
unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
|
||||
|
||||
if ( sort_fails )
|
||||
printf("3D Sort Sum: %f %f Fails: %u\n",sum_before,sum_after,sort_fails);
|
||||
|
||||
ASSERT_EQ(sort_fails,0);
|
||||
ASSERT_EQ(equal_sum,1);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<class ExecutionSpace, typename KeyType>
|
||||
void test_dynamic_view_sort(unsigned int n )
|
||||
{
|
||||
typedef typename ExecutionSpace::memory_space memory_space ;
|
||||
typedef Kokkos::Experimental::DynamicView<KeyType*,ExecutionSpace> KeyDynamicViewType;
|
||||
typedef Kokkos::View<KeyType*,ExecutionSpace> KeyViewType;
|
||||
|
||||
const size_t upper_bound = 2 * n ;
|
||||
|
||||
typename KeyDynamicViewType::memory_pool
|
||||
pool( memory_space() , 2 * n * sizeof(KeyType) );
|
||||
|
||||
KeyDynamicViewType keys("Keys",pool,upper_bound);
|
||||
|
||||
keys.resize_serial(n);
|
||||
|
||||
KeyViewType keys_view("KeysTmp", n );
|
||||
|
||||
// Test sorting array with all numbers equal
|
||||
Kokkos::deep_copy(keys_view,KeyType(1));
|
||||
Kokkos::Experimental::deep_copy(keys,keys_view);
|
||||
Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
|
||||
|
||||
Kokkos::Random_XorShift64_Pool<ExecutionSpace> g(1931);
|
||||
Kokkos::fill_random(keys_view,g,Kokkos::Random_XorShift64_Pool<ExecutionSpace>::generator_type::MAX_URAND);
|
||||
|
||||
Kokkos::Experimental::deep_copy(keys,keys_view);
|
||||
|
||||
double sum_before = 0.0;
|
||||
double sum_after = 0.0;
|
||||
unsigned int sort_fails = 0;
|
||||
|
||||
Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_before);
|
||||
|
||||
Kokkos::sort(keys, 0 /* begin */ , n /* end */ );
|
||||
|
||||
Kokkos::Experimental::deep_copy( keys_view , keys );
|
||||
|
||||
Kokkos::parallel_reduce(n,sum<ExecutionSpace, KeyType>(keys_view),sum_after);
|
||||
Kokkos::parallel_reduce(n-1,is_sorted_struct<ExecutionSpace, KeyType>(keys_view),sort_fails);
|
||||
|
||||
double ratio = sum_before/sum_after;
|
||||
double epsilon = 1e-10;
|
||||
unsigned int equal_sum = (ratio > (1.0-epsilon)) && (ratio < (1.0+epsilon)) ? 1 : 0;
|
||||
|
||||
if ( sort_fails != 0 || equal_sum != 1 ) {
|
||||
std::cout << " N = " << n
|
||||
<< " ; sum_before = " << sum_before
|
||||
<< " ; sum_after = " << sum_after
|
||||
<< " ; ratio = " << ratio
|
||||
<< std::endl ;
|
||||
}
|
||||
|
||||
ASSERT_EQ(sort_fails,0);
|
||||
ASSERT_EQ(equal_sum,1);
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<class ExecutionSpace, typename KeyType>
|
||||
void test_sort(unsigned int N)
|
||||
{
|
||||
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, true);
|
||||
test_1D_sort<ExecutionSpace,KeyType>(N*N*N, false);
|
||||
test_3D_sort<ExecutionSpace,KeyType>(N);
|
||||
test_dynamic_view_sort<ExecutionSpace,KeyType>(N*N);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
@ -140,6 +140,9 @@ do
|
||||
#strip of pedantic because it produces endless warnings about #LINE added by the preprocessor
|
||||
-pedantic|-Wpedantic|-ansi)
|
||||
;;
|
||||
#strip of -Woverloaded-virtual to avoid "cc1: warning: command line option ‘-Woverloaded-virtual’ is valid for C++/ObjC++ but not for C"
|
||||
-Woverloaded-virtual)
|
||||
;;
|
||||
#strip -Xcompiler because we add it
|
||||
-Xcompiler)
|
||||
if [ $first_xcompiler_arg -eq 1 ]; then
|
||||
@ -190,7 +193,7 @@ do
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
|
||||
*.dylib)
|
||||
@*|*.dylib)
|
||||
object_files="$object_files -Xlinker $1"
|
||||
object_files_xlinker="$object_files_xlinker -Xlinker $1"
|
||||
;;
|
||||
|
||||
@ -63,8 +63,7 @@
|
||||
# Source: https://code.google.com/p/qthreads
|
||||
#
|
||||
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
|
||||
REQUIRED_HEADERS qthread.h
|
||||
REQUIRED_LIBS_NAMES "qthread"
|
||||
)
|
||||
|
||||
@ -63,8 +63,7 @@
|
||||
# Source: https://code.google.com/p/qthreads
|
||||
#
|
||||
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
|
||||
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREADS
|
||||
REQUIRED_HEADERS qthread.h
|
||||
REQUIRED_LIBS_NAMES "qthread"
|
||||
)
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
#-----------------------------------------------------------------------------
|
||||
# Building on 'kokkos-dev.sandia.gov' with enabled capabilities:
|
||||
#
|
||||
# Cuda, OpenMP, Threads, Qthread, hwloc
|
||||
# Cuda, OpenMP, Threads, Qthreads, hwloc
|
||||
#
|
||||
# module loaded on 'kokkos-dev.sandia.gov' for this build
|
||||
#
|
||||
@ -82,13 +82,13 @@ CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Trilinos_ENABLE_OpenMP:BOOL=ON"
|
||||
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D Kokkos_ENABLE_OpenMP:BOOL=ON"
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# Qthread
|
||||
# Qthreads
|
||||
|
||||
QTHREAD_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
|
||||
QTHREADS_BASE_DIR="/home/projects/qthreads/2014-07-08/host/gnu/4.7.3"
|
||||
|
||||
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREAD:BOOL=ON"
|
||||
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_INCLUDE_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/include"
|
||||
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREAD_LIBRARY_DIRS:FILEPATH=${QTHREAD_BASE_DIR}/lib"
|
||||
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D TPL_ENABLE_QTHREADS:BOOL=ON"
|
||||
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_INCLUDE_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/include"
|
||||
CMAKE_CONFIGURE="${CMAKE_CONFIGURE} -D QTHREADS_LIBRARY_DIRS:FILEPATH=${QTHREADS_BASE_DIR}/lib"
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
# C++11
|
||||
@ -108,6 +108,3 @@ rm -rf CMake* Trilinos* packages Dart* Testing cmake_install.cmake MakeFile*
|
||||
echo cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
|
||||
|
||||
cmake ${CMAKE_CONFIGURE} ${TRILINOS_SOURCE_DIR}
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
|
||||
@ -5,3 +5,4 @@ tag: 2.02.00 date: 10:30:2016 master: 6c90a581 develop: ca3dd56e
|
||||
tag: 2.02.01 date: 11:01:2016 master: 9c698c86 develop: b0072304
|
||||
tag: 2.02.07 date: 12:16:2016 master: 4b4cc4ba develop: 382c0966
|
||||
tag: 2.02.15 date: 02:10:2017 master: 8c64cd93 develop: 28dea8b6
|
||||
tag: 2.03.00 date: 04:25:2017 master: 120d9ce7 develop: 015ba641
|
||||
|
||||
@ -6,7 +6,7 @@
|
||||
|
||||
set -o pipefail
|
||||
|
||||
# Determine current machine
|
||||
# Determine current machine.
|
||||
|
||||
MACHINE=""
|
||||
HOSTNAME=$(hostname)
|
||||
@ -45,10 +45,11 @@ CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limi
|
||||
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
|
||||
CUDA_WARNING_FLAGS=""
|
||||
|
||||
# Default. Machine specific can override
|
||||
# Default. Machine specific can override.
|
||||
DEBUG=False
|
||||
ARGS=""
|
||||
CUSTOM_BUILD_LIST=""
|
||||
QTHREADS_PATH=""
|
||||
DRYRUN=False
|
||||
BUILD_ONLY=False
|
||||
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
|
||||
@ -60,74 +61,78 @@ PRINT_HELP=False
|
||||
OPT_FLAG=""
|
||||
KOKKOS_OPTIONS=""
|
||||
|
||||
|
||||
#
|
||||
# Handle arguments
|
||||
# Handle arguments.
|
||||
#
|
||||
|
||||
while [[ $# > 0 ]]
|
||||
do
|
||||
key="$1"
|
||||
case $key in
|
||||
--kokkos-path*)
|
||||
KOKKOS_PATH="${key#*=}"
|
||||
;;
|
||||
--build-list*)
|
||||
CUSTOM_BUILD_LIST="${key#*=}"
|
||||
;;
|
||||
--debug*)
|
||||
DEBUG=True
|
||||
;;
|
||||
--build-only*)
|
||||
BUILD_ONLY=True
|
||||
;;
|
||||
--test-script*)
|
||||
TEST_SCRIPT=True
|
||||
;;
|
||||
--skip-hwloc*)
|
||||
SKIP_HWLOC=True
|
||||
;;
|
||||
--num*)
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
|
||||
;;
|
||||
--dry-run*)
|
||||
DRYRUN=True
|
||||
;;
|
||||
--spot-check*)
|
||||
SPOT_CHECK=True
|
||||
;;
|
||||
--arch*)
|
||||
ARCH_FLAG="--arch=${key#*=}"
|
||||
;;
|
||||
--opt-flag*)
|
||||
OPT_FLAG="${key#*=}"
|
||||
;;
|
||||
--with-cuda-options*)
|
||||
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
|
||||
;;
|
||||
--help*)
|
||||
PRINT_HELP=True
|
||||
;;
|
||||
*)
|
||||
# args, just append
|
||||
ARGS="$ARGS $1"
|
||||
;;
|
||||
esac
|
||||
shift
|
||||
key="$1"
|
||||
|
||||
case $key in
|
||||
--kokkos-path*)
|
||||
KOKKOS_PATH="${key#*=}"
|
||||
;;
|
||||
--qthreads-path*)
|
||||
QTHREADS_PATH="${key#*=}"
|
||||
;;
|
||||
--build-list*)
|
||||
CUSTOM_BUILD_LIST="${key#*=}"
|
||||
;;
|
||||
--debug*)
|
||||
DEBUG=True
|
||||
;;
|
||||
--build-only*)
|
||||
BUILD_ONLY=True
|
||||
;;
|
||||
--test-script*)
|
||||
TEST_SCRIPT=True
|
||||
;;
|
||||
--skip-hwloc*)
|
||||
SKIP_HWLOC=True
|
||||
;;
|
||||
--num*)
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
|
||||
;;
|
||||
--dry-run*)
|
||||
DRYRUN=True
|
||||
;;
|
||||
--spot-check*)
|
||||
SPOT_CHECK=True
|
||||
;;
|
||||
--arch*)
|
||||
ARCH_FLAG="--arch=${key#*=}"
|
||||
;;
|
||||
--opt-flag*)
|
||||
OPT_FLAG="${key#*=}"
|
||||
;;
|
||||
--with-cuda-options*)
|
||||
KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}"
|
||||
;;
|
||||
--help*)
|
||||
PRINT_HELP=True
|
||||
;;
|
||||
*)
|
||||
# args, just append
|
||||
ARGS="$ARGS $1"
|
||||
;;
|
||||
esac
|
||||
|
||||
shift
|
||||
done
|
||||
|
||||
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
|
||||
|
||||
# set kokkos path
|
||||
# Set kokkos path.
|
||||
if [ -z "$KOKKOS_PATH" ]; then
|
||||
KOKKOS_PATH=$SCRIPT_KOKKOS_ROOT
|
||||
else
|
||||
# Ensure KOKKOS_PATH is abs path
|
||||
# Ensure KOKKOS_PATH is abs path.
|
||||
KOKKOS_PATH=$( cd $KOKKOS_PATH && pwd )
|
||||
fi
|
||||
|
||||
#
|
||||
# Machine specific config
|
||||
# Machine specific config.
|
||||
#
|
||||
|
||||
if [ "$MACHINE" = "sems" ]; then
|
||||
@ -153,21 +158,17 @@ if [ "$MACHINE" = "sems" ]; then
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
|
||||
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
|
||||
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.7.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.8.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"clang/3.9.0 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
|
||||
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA8_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
fi
|
||||
|
||||
elif [ "$MACHINE" = "white" ]; then
|
||||
source /etc/profile.d/modules.sh
|
||||
SKIP_HWLOC=True
|
||||
@ -177,7 +178,7 @@ elif [ "$MACHINE" = "white" ]; then
|
||||
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
|
||||
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/5.4.0"
|
||||
|
||||
# Don't do pthread on white
|
||||
# Don't do pthread on white.
|
||||
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
|
||||
|
||||
# Format: (compiler module-list build-list exe-name warning-flag)
|
||||
@ -185,9 +186,11 @@ elif [ "$MACHINE" = "white" ]; then
|
||||
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
|
||||
"cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
|
||||
)
|
||||
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG="--arch=Power8,Kepler37"
|
||||
fi
|
||||
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=2
|
||||
|
||||
elif [ "$MACHINE" = "bowman" ]; then
|
||||
@ -300,14 +303,14 @@ elif [ "$MACHINE" = "apollo" ]; then
|
||||
if [ -z "$ARCH_FLAG" ]; then
|
||||
ARCH_FLAG="--arch=SNB,Kepler35"
|
||||
fi
|
||||
|
||||
NUM_JOBS_TO_RUN_IN_PARALLEL=2
|
||||
|
||||
else
|
||||
echo "Unhandled machine $MACHINE" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
||||
|
||||
export OMP_NUM_THREADS=4
|
||||
|
||||
declare -i NUM_RESULTS_TO_KEEP=7
|
||||
@ -315,76 +318,78 @@ declare -i NUM_RESULTS_TO_KEEP=7
|
||||
RESULT_ROOT_PREFIX=TestAll
|
||||
|
||||
if [ "$PRINT_HELP" = "True" ]; then
|
||||
echo "test_all_sandia <ARGS> <OPTIONS>:"
|
||||
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
|
||||
echo " Defaults to root repo containing this script"
|
||||
echo "--debug: Run tests in debug. Defaults to False"
|
||||
echo "--test-script: Test this script, not Kokkos"
|
||||
echo "--skip-hwloc: Do not do hwloc tests"
|
||||
echo "--num=N: Number of jobs to run in parallel"
|
||||
echo "--spot-check: Minimal test set to issue pull request"
|
||||
echo "--dry-run: Just print what would be executed"
|
||||
echo "--build-only: Just do builds, don't run anything"
|
||||
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
|
||||
echo "--arch=ARCHITECTURE: overwrite architecture flags"
|
||||
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
|
||||
echo "--build-list=BUILD,BUILD,BUILD..."
|
||||
echo " Provide a comma-separated list of builds instead of running all builds"
|
||||
echo " Valid items:"
|
||||
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
|
||||
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
|
||||
echo ""
|
||||
echo "test_all_sandia <ARGS> <OPTIONS>:"
|
||||
echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
|
||||
echo " Defaults to root repo containing this script"
|
||||
echo "--debug: Run tests in debug. Defaults to False"
|
||||
echo "--test-script: Test this script, not Kokkos"
|
||||
echo "--skip-hwloc: Do not do hwloc tests"
|
||||
echo "--num=N: Number of jobs to run in parallel"
|
||||
echo "--spot-check: Minimal test set to issue pull request"
|
||||
echo "--dry-run: Just print what would be executed"
|
||||
echo "--build-only: Just do builds, don't run anything"
|
||||
echo "--opt-flag=FLAG: Optimization flag (default: -O3)"
|
||||
echo "--arch=ARCHITECTURE: overwrite architecture flags"
|
||||
echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS"
|
||||
echo "--build-list=BUILD,BUILD,BUILD..."
|
||||
echo " Provide a comma-separated list of builds instead of running all builds"
|
||||
echo " Valid items:"
|
||||
echo " OpenMP, Pthread, Qthreads, Serial, OpenMP_Serial, Pthread_Serial"
|
||||
echo " Qthreads_Serial, Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
|
||||
echo ""
|
||||
|
||||
echo "ARGS: list of expressions matching compilers to test"
|
||||
echo " supported compilers sems"
|
||||
for COMPILER_DATA in "${COMPILERS[@]}"; do
|
||||
echo "ARGS: list of expressions matching compilers to test"
|
||||
echo " supported compilers sems"
|
||||
for COMPILER_DATA in "${COMPILERS[@]}"; do
|
||||
ARR=($COMPILER_DATA)
|
||||
COMPILER=${ARR[0]}
|
||||
echo " $COMPILER"
|
||||
done
|
||||
echo ""
|
||||
done
|
||||
echo ""
|
||||
|
||||
echo "Examples:"
|
||||
echo " Run all tests"
|
||||
echo " % test_all_sandia"
|
||||
echo ""
|
||||
echo " Run all gcc tests"
|
||||
echo " % test_all_sandia gcc"
|
||||
echo ""
|
||||
echo " Run all gcc/4.7.2 and all intel tests"
|
||||
echo " % test_all_sandia gcc/4.7.2 intel"
|
||||
echo ""
|
||||
echo " Run all tests in debug"
|
||||
echo " % test_all_sandia --debug"
|
||||
echo ""
|
||||
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
|
||||
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
|
||||
echo ""
|
||||
echo "If you want to kill the tests, do:"
|
||||
echo " hit ctrl-z"
|
||||
echo " % kill -9 %1"
|
||||
echo
|
||||
exit 0
|
||||
echo "Examples:"
|
||||
echo " Run all tests"
|
||||
echo " % test_all_sandia"
|
||||
echo ""
|
||||
echo " Run all gcc tests"
|
||||
echo " % test_all_sandia gcc"
|
||||
echo ""
|
||||
echo " Run all gcc/4.7.2 and all intel tests"
|
||||
echo " % test_all_sandia gcc/4.7.2 intel"
|
||||
echo ""
|
||||
echo " Run all tests in debug"
|
||||
echo " % test_all_sandia --debug"
|
||||
echo ""
|
||||
echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds"
|
||||
echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial"
|
||||
echo ""
|
||||
echo "If you want to kill the tests, do:"
|
||||
echo " hit ctrl-z"
|
||||
echo " % kill -9 %1"
|
||||
echo
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# set build type
|
||||
# Set build type.
|
||||
if [ "$DEBUG" = "True" ]; then
|
||||
BUILD_TYPE=debug
|
||||
else
|
||||
BUILD_TYPE=release
|
||||
fi
|
||||
|
||||
# If no args provided, do all compilers
|
||||
# If no args provided, do all compilers.
|
||||
if [ -z "$ARGS" ]; then
|
||||
ARGS='?'
|
||||
fi
|
||||
|
||||
# Process args to figure out which compilers to test
|
||||
# Process args to figure out which compilers to test.
|
||||
COMPILERS_TO_TEST=""
|
||||
|
||||
for ARG in $ARGS; do
|
||||
for COMPILER_DATA in "${COMPILERS[@]}"; do
|
||||
ARR=($COMPILER_DATA)
|
||||
COMPILER=${ARR[0]}
|
||||
|
||||
if [[ "$COMPILER" = $ARG* ]]; then
|
||||
if [[ "$COMPILERS_TO_TEST" != *${COMPILER}* ]]; then
|
||||
COMPILERS_TO_TEST="$COMPILERS_TO_TEST $COMPILER"
|
||||
@ -395,8 +400,35 @@ for ARG in $ARGS; do
|
||||
done
|
||||
done
|
||||
|
||||
# Check if Qthreads build requested.
|
||||
HAVE_QTHREADS_BUILD="False"
|
||||
if [ -n "$CUSTOM_BUILD_LIST" ]; then
|
||||
if [[ "$CUSTOM_BUILD_LIST" = *Qthreads* ]]; then
|
||||
HAVE_QTHREADS_BUILD="True"
|
||||
fi
|
||||
else
|
||||
for COMPILER_DATA in "${COMPILERS[@]}"; do
|
||||
ARR=($COMPILER_DATA)
|
||||
BUILD_LIST=${ARR[2]}
|
||||
if [[ "$BUILD_LIST" = *Qthreads* ]]; then
|
||||
HAVE_QTHREADS_BUILD="True"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
|
||||
# Ensure Qthreads path is set if Qthreads build is requested.
|
||||
if [ "$HAVE_QTHREADS_BUILD" = "True" ]; then
|
||||
if [ -z "$QTHREADS_PATH" ]; then
|
||||
echo "Need to supply Qthreads path (--qthreads-path) when testing Qthreads backend." >&2
|
||||
exit 1
|
||||
else
|
||||
# Strip trailing slashes from path.
|
||||
QTHREADS_PATH=$(echo $QTHREADS_PATH | sed 's/\/*$//')
|
||||
fi
|
||||
fi
|
||||
|
||||
#
|
||||
# Functions
|
||||
# Functions.
|
||||
#
|
||||
|
||||
# get_compiler_name <COMPILER>
|
||||
@ -409,7 +441,7 @@ get_compiler_version() {
|
||||
echo $1 | cut -d/ -f2
|
||||
}
|
||||
|
||||
# Do not call directly
|
||||
# Do not call directly.
|
||||
get_compiler_data() {
|
||||
local compiler=$1
|
||||
local item=$2
|
||||
@ -419,13 +451,14 @@ get_compiler_data() {
|
||||
local compiler_data
|
||||
for compiler_data in "${COMPILERS[@]}" ; do
|
||||
local arr=($compiler_data)
|
||||
|
||||
if [ "$compiler" = "${arr[0]}" ]; then
|
||||
echo "${arr[$item]}" | tr , ' ' | sed -e "s/<COMPILER_NAME>/$compiler_name/g" -e "s/<COMPILER_VERSION>/$compiler_vers/g"
|
||||
return 0
|
||||
fi
|
||||
done
|
||||
|
||||
# Not found
|
||||
# Not found.
|
||||
echo "Unreconized compiler $compiler" >&2
|
||||
exit 1
|
||||
}
|
||||
@ -459,14 +492,14 @@ run_cmd() {
|
||||
|
||||
# report_and_log_test_results <SUCCESS> <DESC> <COMMENT>
|
||||
report_and_log_test_result() {
|
||||
# Use sane var names
|
||||
# Use sane var names.
|
||||
local success=$1; local desc=$2; local comment=$3;
|
||||
|
||||
if [ "$success" = "0" ]; then
|
||||
echo " PASSED $desc"
|
||||
echo $comment > $PASSED_DIR/$desc
|
||||
else
|
||||
# For failures, comment should be the name of the phase that failed
|
||||
# For failures, comment should be the name of the phase that failed.
|
||||
echo " FAILED $desc" >&2
|
||||
echo $comment > $FAILED_DIR/$desc
|
||||
cat ${desc}.${comment}.log
|
||||
@ -494,16 +527,16 @@ setup_env() {
|
||||
|
||||
# single_build_and_test <COMPILER> <BUILD> <BUILD_TYPE>
|
||||
single_build_and_test() {
|
||||
# Use sane var names
|
||||
# Use sane var names.
|
||||
local compiler=$1; local build=$2; local build_type=$3;
|
||||
|
||||
# set up env
|
||||
# Set up env.
|
||||
mkdir -p $ROOT_DIR/$compiler/"${build}-$build_type"
|
||||
cd $ROOT_DIR/$compiler/"${build}-$build_type"
|
||||
local desc=$(echo "${compiler}-${build}-${build_type}" | sed 's:/:-:g')
|
||||
setup_env $compiler >& ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
|
||||
|
||||
# Set up flags
|
||||
# Set up flags.
|
||||
local compiler_warning_flags=$(get_compiler_warning_flags $compiler)
|
||||
local compiler_exe=$(get_compiler_exe_name $compiler)
|
||||
|
||||
@ -511,6 +544,14 @@ single_build_and_test() {
|
||||
local extra_args=--with-hwloc=$(dirname $(dirname $(which hwloc-info)))
|
||||
fi
|
||||
|
||||
if [[ "$build" = *Qthreads* ]]; then
|
||||
if [[ "$build_type" = hwloc* ]]; then
|
||||
local extra_args="$extra_args --qthreads-path=${QTHREADS_PATH}_hwloc"
|
||||
else
|
||||
local extra_args="$extra_args --qthreads-path=$QTHREADS_PATH"
|
||||
fi
|
||||
fi
|
||||
|
||||
if [[ "$OPT_FLAG" = "" ]]; then
|
||||
OPT_FLAG="-O3"
|
||||
fi
|
||||
@ -522,11 +563,6 @@ single_build_and_test() {
|
||||
local cxxflags="$OPT_FLAG $compiler_warning_flags"
|
||||
fi
|
||||
|
||||
if [[ "$compiler" == cuda* ]]; then
|
||||
cxxflags="--keep --keep-dir=$(pwd) $cxxflags"
|
||||
export TMPDIR=$(pwd)
|
||||
fi
|
||||
|
||||
if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then
|
||||
local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS"
|
||||
fi
|
||||
@ -538,6 +574,7 @@ single_build_and_test() {
|
||||
if [ "$TEST_SCRIPT" = "True" ]; then
|
||||
local rand=$[ 1 + $[ RANDOM % 10 ]]
|
||||
sleep $rand
|
||||
|
||||
if [ $rand -gt 5 ]; then
|
||||
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
|
||||
fi
|
||||
@ -547,6 +584,7 @@ single_build_and_test() {
|
||||
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
|
||||
local -i build_end_time=$(date +%s)
|
||||
comment="build_time=$(($build_end_time-$build_start_time))"
|
||||
|
||||
if [[ "$BUILD_ONLY" == False ]]; then
|
||||
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
|
||||
local -i run_end_time=$(date +%s)
|
||||
@ -576,7 +614,7 @@ run_in_background() {
|
||||
local compiler=$1
|
||||
|
||||
local -i num_jobs=$NUM_JOBS_TO_RUN_IN_PARALLEL
|
||||
# don't override command line input
|
||||
# Don't override command line input.
|
||||
# if [[ "$BUILD_ONLY" == True ]]; then
|
||||
# num_jobs=8
|
||||
# else
|
||||
@ -591,7 +629,7 @@ run_in_background() {
|
||||
|
||||
# build_and_test_all <COMPILER>
|
||||
build_and_test_all() {
|
||||
# Get compiler data
|
||||
# Get compiler data.
|
||||
local compiler=$1
|
||||
if [ -z "$CUSTOM_BUILD_LIST" ]; then
|
||||
local compiler_build_list=$(get_compiler_build_list $compiler)
|
||||
@ -599,13 +637,13 @@ build_and_test_all() {
|
||||
local compiler_build_list=$(echo "$CUSTOM_BUILD_LIST" | tr , ' ')
|
||||
fi
|
||||
|
||||
# do builds
|
||||
# Do builds.
|
||||
local build
|
||||
for build in $compiler_build_list
|
||||
do
|
||||
run_in_background $compiler $build $BUILD_TYPE
|
||||
|
||||
# If not cuda, do a hwloc test too
|
||||
# If not cuda, do a hwloc test too.
|
||||
if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
|
||||
run_in_background $compiler $build "hwloc-$BUILD_TYPE"
|
||||
fi
|
||||
@ -655,7 +693,7 @@ wait_summarize_and_exit() {
|
||||
}
|
||||
|
||||
#
|
||||
# Main
|
||||
# Main.
|
||||
#
|
||||
|
||||
ROOT_DIR=$(get_test_root_dir)
|
||||
|
||||
@ -60,7 +60,7 @@ class DynamicView : public Kokkos::ViewTraits< DataType , P ... >
|
||||
{
|
||||
public:
|
||||
|
||||
typedef ViewTraits< DataType , P ... > traits ;
|
||||
typedef Kokkos::ViewTraits< DataType , P ... > traits ;
|
||||
|
||||
private:
|
||||
|
||||
@ -123,30 +123,41 @@ public:
|
||||
|
||||
enum { Rank = 1 };
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t size() const
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_t size() const noexcept
|
||||
{
|
||||
return
|
||||
Kokkos::Impl::MemorySpaceAccess
|
||||
uintptr_t n = 0 ;
|
||||
|
||||
if ( Kokkos::Impl::MemorySpaceAccess
|
||||
< Kokkos::Impl::ActiveExecutionMemorySpace
|
||||
, typename traits::memory_space
|
||||
>::accessible
|
||||
? // Runtime size is at the end of the chunk pointer array
|
||||
(*reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max ))
|
||||
<< m_chunk_shift
|
||||
: 0 ;
|
||||
>::accessible ) {
|
||||
n = *reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max );
|
||||
}
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
else {
|
||||
Kokkos::Impl::DeepCopy< Kokkos::HostSpace
|
||||
, typename traits::memory_space
|
||||
, Kokkos::HostSpace::execution_space >
|
||||
( & n
|
||||
, reinterpret_cast<const uintptr_t*>( m_chunks + m_chunk_max )
|
||||
, sizeof(uintptr_t) );
|
||||
}
|
||||
#endif
|
||||
return n << m_chunk_shift ;
|
||||
}
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_t extent( const iType & r ) const
|
||||
{ return r == 0 ? size() : 1 ; }
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION constexpr
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_t extent_int( const iType & r ) const
|
||||
{ return r == 0 ? size() : 1 ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_0() const { return size(); }
|
||||
KOKKOS_INLINE_FUNCTION size_t dimension_0() const { return size(); }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_1() const { return 1 ; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_2() const { return 1 ; }
|
||||
KOKKOS_INLINE_FUNCTION constexpr size_t dimension_3() const { return 1 ; }
|
||||
@ -270,10 +281,18 @@ public:
|
||||
}
|
||||
|
||||
/** \brief Resizing in serial can grow or shrink the array size, */
|
||||
template< typename IntType >
|
||||
inline
|
||||
void resize_serial( size_t n )
|
||||
typename std::enable_if
|
||||
< std::is_integral<IntType>::value &&
|
||||
Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
|
||||
, typename traits::memory_space
|
||||
>::accessible
|
||||
>::type
|
||||
resize_serial( IntType const & n )
|
||||
{
|
||||
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
|
||||
typedef typename traits::value_type value_type ;
|
||||
typedef value_type * pointer_type ;
|
||||
|
||||
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
|
||||
|
||||
@ -286,8 +305,8 @@ public:
|
||||
|
||||
if ( *pc < NC ) {
|
||||
while ( *pc < NC ) {
|
||||
m_chunks[*pc] =
|
||||
m_pool.allocate( sizeof(traits::value_type) << m_chunk_shift );
|
||||
m_chunks[*pc] = reinterpret_cast<pointer_type>
|
||||
( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
|
||||
++*pc ;
|
||||
}
|
||||
}
|
||||
@ -295,12 +314,90 @@ public:
|
||||
while ( NC + 1 <= *pc ) {
|
||||
--*pc ;
|
||||
m_pool.deallocate( m_chunks[*pc]
|
||||
, sizeof(traits::value_type) << m_chunk_shift );
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*pc] = 0 ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
struct ResizeSerial {
|
||||
memory_pool m_pool ;
|
||||
typename traits::value_type ** m_chunks ;
|
||||
uintptr_t * m_pc ;
|
||||
uintptr_t m_nc ;
|
||||
unsigned m_chunk_shift ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()( int ) const
|
||||
{
|
||||
typedef typename traits::value_type value_type ;
|
||||
typedef value_type * pointer_type ;
|
||||
|
||||
if ( *m_pc < m_nc ) {
|
||||
while ( *m_pc < m_nc ) {
|
||||
m_chunks[*m_pc] = reinterpret_cast<pointer_type>
|
||||
( m_pool.allocate( sizeof(value_type) << m_chunk_shift ) );
|
||||
++*m_pc ;
|
||||
}
|
||||
}
|
||||
else {
|
||||
while ( m_nc + 1 <= *m_pc ) {
|
||||
--*m_pc ;
|
||||
m_pool.deallocate( m_chunks[*m_pc]
|
||||
, sizeof(value_type) << m_chunk_shift );
|
||||
m_chunks[*m_pc] = 0 ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
ResizeSerial( memory_pool const & arg_pool
|
||||
, typename traits::value_type ** arg_chunks
|
||||
, uintptr_t * arg_pc
|
||||
, uintptr_t arg_nc
|
||||
, unsigned arg_chunk_shift
|
||||
)
|
||||
: m_pool( arg_pool )
|
||||
, m_chunks( arg_chunks )
|
||||
, m_pc( arg_pc )
|
||||
, m_nc( arg_nc )
|
||||
, m_chunk_shift( arg_chunk_shift )
|
||||
{}
|
||||
};
|
||||
|
||||
template< typename IntType >
|
||||
inline
|
||||
typename std::enable_if
|
||||
< std::is_integral<IntType>::value &&
|
||||
! Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace
|
||||
, typename traits::memory_space
|
||||
>::accessible
|
||||
>::type
|
||||
resize_serial( IntType const & n )
|
||||
{
|
||||
const uintptr_t NC = ( n + m_chunk_mask ) >> m_chunk_shift ;
|
||||
|
||||
if ( m_chunk_max < NC ) {
|
||||
Kokkos::abort("DynamicView::resize_serial exceeded maximum size");
|
||||
}
|
||||
|
||||
// Must dispatch kernel
|
||||
|
||||
typedef Kokkos::RangePolicy< typename traits::execution_space > Range ;
|
||||
|
||||
uintptr_t * const pc =
|
||||
reinterpret_cast<uintptr_t*>( m_chunks + m_chunk_max );
|
||||
|
||||
Kokkos::Impl::ParallelFor<ResizeSerial,Range>
|
||||
closure( ResizeSerial( m_pool, m_chunks, pc, NC, m_chunk_shift )
|
||||
, Range(0,1) );
|
||||
|
||||
closure.execute();
|
||||
|
||||
traits::execution_space::fence();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
|
||||
~DynamicView() = default ;
|
||||
@ -311,15 +408,17 @@ public:
|
||||
DynamicView & operator = ( const DynamicView & ) = default ;
|
||||
|
||||
template< class RT , class ... RP >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
DynamicView( const DynamicView<RT,RP...> & rhs )
|
||||
: m_pool( rhs.m_pool )
|
||||
, m_track( rhs.m_track )
|
||||
, m_chunks( rhs.m_chunks )
|
||||
, m_chunks( (typename traits::value_type **) rhs.m_chunks )
|
||||
, m_chunk_shift( rhs.m_chunk_shift )
|
||||
, m_chunk_mask( rhs.m_chunk_mask )
|
||||
, m_chunk_max( rhs.m_chunk_max )
|
||||
{
|
||||
typedef typename DynamicView<RT,RP...>::traits SrcTraits ;
|
||||
typedef Kokkos::Impl::ViewMapping< traits , SrcTraits , void > Mapping ;
|
||||
static_assert( Mapping::is_assignable , "Incompatible DynamicView copy construction" );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------
|
||||
@ -400,8 +499,6 @@ public:
|
||||
, m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
|
||||
, m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
|
||||
{
|
||||
DynamicView::template verify_space< Kokkos::Impl::ActiveExecutionMemorySpace >::check();
|
||||
|
||||
// A functor to deallocate all of the chunks upon final destruction
|
||||
|
||||
typedef typename traits::memory_space memory_space ;
|
||||
|
||||
@ -230,16 +230,17 @@ public:
|
||||
typedef typename Impl::remove_const<declared_value_type>::type value_type;
|
||||
typedef typename Impl::add_const<value_type>::type const_value_type;
|
||||
|
||||
typedef Device execution_space;
|
||||
typedef Device device_type;
|
||||
typedef typename Device::execution_space execution_space;
|
||||
typedef Hasher hasher_type;
|
||||
typedef EqualTo equal_to_type;
|
||||
typedef uint32_t size_type;
|
||||
|
||||
//map_types
|
||||
typedef UnorderedMap<declared_key_type,declared_value_type,execution_space,hasher_type,equal_to_type> declared_map_type;
|
||||
typedef UnorderedMap<key_type,value_type,execution_space,hasher_type,equal_to_type> insertable_map_type;
|
||||
typedef UnorderedMap<const_key_type,value_type,execution_space,hasher_type,equal_to_type> modifiable_map_type;
|
||||
typedef UnorderedMap<const_key_type,const_value_type,execution_space,hasher_type,equal_to_type> const_map_type;
|
||||
typedef UnorderedMap<declared_key_type,declared_value_type,device_type,hasher_type,equal_to_type> declared_map_type;
|
||||
typedef UnorderedMap<key_type,value_type,device_type,hasher_type,equal_to_type> insertable_map_type;
|
||||
typedef UnorderedMap<const_key_type,value_type,device_type,hasher_type,equal_to_type> modifiable_map_type;
|
||||
typedef UnorderedMap<const_key_type,const_value_type,device_type,hasher_type,equal_to_type> const_map_type;
|
||||
|
||||
static const bool is_set = std::is_same<void,value_type>::value;
|
||||
static const bool has_const_key = std::is_same<const_key_type,declared_key_type>::value;
|
||||
@ -264,18 +265,18 @@ private:
|
||||
typedef typename Impl::if_c< is_set, int, declared_value_type>::type impl_value_type;
|
||||
|
||||
typedef typename Impl::if_c< is_insertable_map
|
||||
, View< key_type *, execution_space>
|
||||
, View< const key_type *, execution_space, MemoryTraits<RandomAccess> >
|
||||
, View< key_type *, device_type>
|
||||
, View< const key_type *, device_type, MemoryTraits<RandomAccess> >
|
||||
>::type key_type_view;
|
||||
|
||||
typedef typename Impl::if_c< is_insertable_map || is_modifiable_map
|
||||
, View< impl_value_type *, execution_space>
|
||||
, View< const impl_value_type *, execution_space, MemoryTraits<RandomAccess> >
|
||||
, View< impl_value_type *, device_type>
|
||||
, View< const impl_value_type *, device_type, MemoryTraits<RandomAccess> >
|
||||
>::type value_type_view;
|
||||
|
||||
typedef typename Impl::if_c< is_insertable_map
|
||||
, View< size_type *, execution_space>
|
||||
, View< const size_type *, execution_space, MemoryTraits<RandomAccess> >
|
||||
, View< size_type *, device_type>
|
||||
, View< const size_type *, device_type, MemoryTraits<RandomAccess> >
|
||||
>::type size_type_view;
|
||||
|
||||
typedef typename Impl::if_c< is_insertable_map
|
||||
@ -285,7 +286,7 @@ private:
|
||||
|
||||
enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 };
|
||||
enum { num_scalars = 3 };
|
||||
typedef View< int[num_scalars], LayoutLeft, execution_space> scalars_view;
|
||||
typedef View< int[num_scalars], LayoutLeft, device_type> scalars_view;
|
||||
|
||||
public:
|
||||
//! \name Public member functions
|
||||
@ -757,7 +758,7 @@ public:
|
||||
|
||||
Kokkos::deep_copy(tmp.m_available_indexes, src.m_available_indexes);
|
||||
|
||||
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, typename SDevice::memory_space > raw_deep_copy;
|
||||
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, typename SDevice::memory_space > raw_deep_copy;
|
||||
|
||||
raw_deep_copy(tmp.m_hash_lists.ptr_on_device(), src.m_hash_lists.ptr_on_device(), sizeof(size_type)*src.m_hash_lists.dimension_0());
|
||||
raw_deep_copy(tmp.m_next_index.ptr_on_device(), src.m_next_index.ptr_on_device(), sizeof(size_type)*src.m_next_index.dimension_0());
|
||||
@ -781,21 +782,21 @@ private: // private member functions
|
||||
|
||||
void set_flag(int flag) const
|
||||
{
|
||||
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
|
||||
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
|
||||
const int true_ = true;
|
||||
raw_deep_copy(m_scalars.ptr_on_device() + flag, &true_, sizeof(int));
|
||||
}
|
||||
|
||||
void reset_flag(int flag) const
|
||||
{
|
||||
typedef Kokkos::Impl::DeepCopy< typename execution_space::memory_space, Kokkos::HostSpace > raw_deep_copy;
|
||||
typedef Kokkos::Impl::DeepCopy< typename device_type::memory_space, Kokkos::HostSpace > raw_deep_copy;
|
||||
const int false_ = false;
|
||||
raw_deep_copy(m_scalars.ptr_on_device() + flag, &false_, sizeof(int));
|
||||
}
|
||||
|
||||
bool get_flag(int flag) const
|
||||
{
|
||||
typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename execution_space::memory_space > raw_deep_copy;
|
||||
typedef Kokkos::Impl::DeepCopy< Kokkos::HostSpace, typename device_type::memory_space > raw_deep_copy;
|
||||
int result = false;
|
||||
raw_deep_copy(&result, m_scalars.ptr_on_device() + flag, sizeof(int));
|
||||
return result;
|
||||
|
||||
@ -3,38 +3,49 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR})
|
||||
INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src )
|
||||
|
||||
SET(SOURCES
|
||||
UnitTestMain.cpp
|
||||
TestCuda.cpp
|
||||
)
|
||||
|
||||
SET(LIBRARIES kokkoscore)
|
||||
|
||||
IF(Kokkos_ENABLE_Pthread)
|
||||
LIST( APPEND SOURCES
|
||||
TestThreads.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_Serial)
|
||||
LIST( APPEND SOURCES
|
||||
TestSerial.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_OpenMP)
|
||||
LIST( APPEND SOURCES
|
||||
TestOpenMP.cpp
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
UnitTest
|
||||
SOURCES ${SOURCES}
|
||||
UnitTest_Threads
|
||||
SOURCES TestThreads.cpp UnitTestMain.cpp
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_Serial)
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
UnitTest_Serial
|
||||
SOURCES TestSerial.cpp UnitTestMain.cpp
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_OpenMP)
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
UnitTest_OpenMP
|
||||
SOURCES TestOpenMP.cpp UnitTestMain.cpp
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_Cuda)
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
UnitTest_Cuda
|
||||
SOURCES TestCuda.cpp UnitTestMain.cpp
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
|
||||
@ -64,6 +64,7 @@ struct TestDynamicView
|
||||
typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;
|
||||
|
||||
typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
|
||||
typedef typename view_type::const_type const_view_type ;
|
||||
|
||||
typedef typename Kokkos::TeamPolicy<execution_space>::member_type member_type ;
|
||||
typedef double value_type;
|
||||
@ -136,6 +137,8 @@ struct TestDynamicView
|
||||
|
||||
view_type da("A",pool,arg_total_size);
|
||||
|
||||
const_view_type ca(da);
|
||||
|
||||
// printf("TestDynamicView::run(%d) construct test functor\n",arg_total_size);
|
||||
|
||||
TestDynamicView functor(da,arg_total_size);
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
TRIBITS_PACKAGE_DEFINE_DEPENDENCIES(
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREAD DLlib
|
||||
LIB_OPTIONAL_TPLS Pthread CUDA HWLOC QTHREADS DLlib
|
||||
TEST_OPTIONAL_TPLS CUSPARSE
|
||||
)
|
||||
|
||||
|
||||
@ -30,7 +30,7 @@
|
||||
|
||||
#cmakedefine KOKKOS_HAVE_PTHREAD
|
||||
#cmakedefine KOKKOS_HAVE_SERIAL
|
||||
#cmakedefine KOKKOS_HAVE_QTHREAD
|
||||
#cmakedefine KOKKOS_HAVE_QTHREADS
|
||||
#cmakedefine KOKKOS_HAVE_Winthread
|
||||
#cmakedefine KOKKOS_HAVE_OPENMP
|
||||
#cmakedefine KOKKOS_HAVE_HWLOC
|
||||
|
||||
@ -60,4 +60,3 @@ clean: kokkos-clean
|
||||
|
||||
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
|
||||
|
||||
|
||||
@ -52,6 +52,8 @@
|
||||
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
#include <PerfTestMDRange.hpp>
|
||||
|
||||
#include <PerfTestHexGrad.hpp>
|
||||
#include <PerfTestBlasKernels.hpp>
|
||||
#include <PerfTestGramSchmidt.hpp>
|
||||
@ -72,6 +74,14 @@ class cuda : public ::testing::Test {
|
||||
}
|
||||
};
|
||||
|
||||
//TEST_F( cuda, mdrange_lr ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutRight>( 5, 8, "Kokkos::Cuda" )) );
|
||||
//}
|
||||
|
||||
//TEST_F( cuda, mdrange_ll ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<Kokkos::Cuda , Kokkos::LayoutLeft>( 5, 8, "Kokkos::Cuda" )) );
|
||||
//}
|
||||
|
||||
TEST_F( cuda, hexgrad )
|
||||
{
|
||||
EXPECT_NO_THROW( run_test_hexgrad< Kokkos::Cuda >( 10 , 20, "Kokkos::Cuda" ) );
|
||||
|
||||
@ -60,6 +60,342 @@ namespace Test {
|
||||
|
||||
enum { NUMBER_OF_TRIALS = 5 };
|
||||
|
||||
template< class DeviceType , class LayoutType >
|
||||
void run_test_mdrange( int exp_beg , int exp_end, const char deviceTypeName[], int range_offset = 0, int tile_offset = 0 )
|
||||
// exp_beg = 6 => 2^6 = 64 is starting range length
|
||||
{
|
||||
#define MDRANGE_PERFORMANCE_OUTPUT_VERBOSE 0
|
||||
|
||||
std::string label_mdrange ;
|
||||
label_mdrange.append( "\"MDRange< double , " );
|
||||
label_mdrange.append( deviceTypeName );
|
||||
label_mdrange.append( " >\"" );
|
||||
|
||||
std::string label_range_col2 ;
|
||||
label_range_col2.append( "\"RangeColTwo< double , " );
|
||||
label_range_col2.append( deviceTypeName );
|
||||
label_range_col2.append( " >\"" );
|
||||
|
||||
std::string label_range_col_all ;
|
||||
label_range_col_all.append( "\"RangeColAll< double , " );
|
||||
label_range_col_all.append( deviceTypeName );
|
||||
label_range_col_all.append( " >\"" );
|
||||
|
||||
if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value) {
|
||||
std::cout << "--------------------------------------------------------------\n"
|
||||
<< "Performance tests for MDRange Layout Right"
|
||||
<< "\n--------------------------------------------------------------" << std::endl;
|
||||
} else {
|
||||
std::cout << "--------------------------------------------------------------\n"
|
||||
<< "Performance tests for MDRange Layout Left"
|
||||
<< "\n--------------------------------------------------------------" << std::endl;
|
||||
}
|
||||
|
||||
|
||||
for (int i = exp_beg ; i < exp_end ; ++i) {
|
||||
const int range_length = (1<<i) + range_offset;
|
||||
|
||||
std::cout << "\n--------------------------------------------------------------\n"
|
||||
<< "--------------------------------------------------------------\n"
|
||||
<< "MDRange Test: range bounds: " << range_length << " , " << range_length << " , " << range_length
|
||||
<< "\n--------------------------------------------------------------\n"
|
||||
<< "--------------------------------------------------------------\n";
|
||||
// << std::endl;
|
||||
|
||||
int t0_min = 0, t1_min = 0, t2_min = 0;
|
||||
double seconds_min = 0.0;
|
||||
|
||||
// Test 1: The MDRange in full
|
||||
{
|
||||
int t0 = 1, t1 = 1, t2 = 1;
|
||||
int counter = 1;
|
||||
#if !defined(KOKKOS_HAVE_CUDA)
|
||||
int min_bnd = 8;
|
||||
int tfast = range_length;
|
||||
#else
|
||||
int min_bnd = 2;
|
||||
int tfast = 32;
|
||||
#endif
|
||||
while ( tfast >= min_bnd ) {
|
||||
int tmid = min_bnd;
|
||||
while ( tmid < tfast ) {
|
||||
t0 = min_bnd;
|
||||
t1 = tmid;
|
||||
t2 = tfast;
|
||||
int t2_rev = min_bnd;
|
||||
int t1_rev = tmid;
|
||||
int t0_rev = tfast;
|
||||
|
||||
#if defined(KOKKOS_HAVE_CUDA)
|
||||
//Note: Product of tile sizes must be < 1024 for Cuda
|
||||
if ( t0*t1*t2 >= 1024 ) {
|
||||
printf(" Exceeded Cuda tile limits; onto next range set\n\n");
|
||||
break;
|
||||
}
|
||||
#endif
|
||||
|
||||
// Run 1 with tiles LayoutRight style
|
||||
double seconds_1 = 0;
|
||||
{ seconds_1 = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0, t1, t2) ; }
|
||||
|
||||
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
|
||||
std::cout << label_mdrange
|
||||
<< " , " << t0 << " , " << t1 << " , " << t2
|
||||
<< " , " << seconds_1
|
||||
<< std::endl ;
|
||||
#endif
|
||||
|
||||
if ( counter == 1 ) {
|
||||
seconds_min = seconds_1;
|
||||
t0_min = t0;
|
||||
t1_min = t1;
|
||||
t2_min = t2;
|
||||
}
|
||||
else {
|
||||
if ( seconds_1 < seconds_min )
|
||||
{
|
||||
seconds_min = seconds_1;
|
||||
t0_min = t0;
|
||||
t1_min = t1;
|
||||
t2_min = t2;
|
||||
}
|
||||
}
|
||||
|
||||
// Run 2 with tiles LayoutLeft style - reverse order of tile dims
|
||||
double seconds_1rev = 0;
|
||||
{ seconds_1rev = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, t0_rev, t1_rev, t2_rev) ; }
|
||||
|
||||
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
|
||||
std::cout << label_mdrange
|
||||
<< " , " << t0_rev << " , " << t1_rev << " , " << t2_rev
|
||||
<< " , " << seconds_1rev
|
||||
<< std::endl ;
|
||||
#endif
|
||||
|
||||
if ( seconds_1rev < seconds_min )
|
||||
{
|
||||
seconds_min = seconds_1rev;
|
||||
t0_min = t0_rev;
|
||||
t1_min = t1_rev;
|
||||
t2_min = t2_rev;
|
||||
}
|
||||
|
||||
++counter;
|
||||
tmid <<= 1;
|
||||
} //end inner while
|
||||
tfast >>=1;
|
||||
} //end outer while
|
||||
|
||||
std::cout << "\n"
|
||||
<< "--------------------------------------------------------------\n"
|
||||
<< label_mdrange
|
||||
<< "\n Min values "
|
||||
<< "\n Range length per dim (3D): " << range_length
|
||||
<< "\n TileDims: " << t0_min << " , " << t1_min << " , " << t2_min
|
||||
<< "\n Min time: " << seconds_min
|
||||
<< "\n---------------------------------------------------------------"
|
||||
<< std::endl ;
|
||||
} //end scope
|
||||
|
||||
#if !defined(KOKKOS_HAVE_CUDA)
|
||||
double seconds_min_c = 0.0;
|
||||
int t0c_min = 0, t1c_min = 0, t2c_min = 0;
|
||||
int counter = 1;
|
||||
{
|
||||
int min_bnd = 8;
|
||||
// Test 1_c: MDRange with 0 for 'inner' tile dim; this case will utilize the full span in that direction, should be similar to Collapse<2>
|
||||
if ( std::is_same<LayoutType, Kokkos::LayoutRight>::value ) {
|
||||
for ( unsigned int T0 = min_bnd; T0 < static_cast<unsigned int>(range_length); T0<<=1 ) {
|
||||
for ( unsigned int T1 = min_bnd; T1 < static_cast<unsigned int>(range_length); T1<<=1 ) {
|
||||
double seconds_c = 0;
|
||||
{ seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, T0, T1, 0) ; }
|
||||
|
||||
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
|
||||
std::cout << " MDRange LR with '0' tile - collapse-like \n"
|
||||
<< label_mdrange
|
||||
<< " , " << T0 << " , " << T1 << " , " << range_length
|
||||
<< " , " << seconds_c
|
||||
<< std::endl ;
|
||||
#endif
|
||||
|
||||
t2c_min = range_length;
|
||||
if ( counter == 1 ) {
|
||||
seconds_min_c = seconds_c;
|
||||
t0c_min = T0;
|
||||
t1c_min = T1;
|
||||
}
|
||||
else {
|
||||
if ( seconds_c < seconds_min_c )
|
||||
{
|
||||
seconds_min_c = seconds_c;
|
||||
t0c_min = T0;
|
||||
t1c_min = T1;
|
||||
}
|
||||
}
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
}
|
||||
else {
|
||||
for ( unsigned int T1 = min_bnd; T1 <= static_cast<unsigned int>(range_length); T1<<=1 ) {
|
||||
for ( unsigned int T2 = min_bnd; T2 <= static_cast<unsigned int>(range_length); T2<<=1 ) {
|
||||
double seconds_c = 0;
|
||||
{ seconds_c = MultiDimRangePerf3D< DeviceType , double , LayoutType >::test_multi_index(range_length,range_length,range_length, 0, T1, T2) ; }
|
||||
|
||||
#if MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
|
||||
std::cout << " MDRange LL with '0' tile - collapse-like \n"
|
||||
<< label_mdrange
|
||||
<< " , " <<range_length << " < " << T1 << " , " << T2
|
||||
<< " , " << seconds_c
|
||||
<< std::endl ;
|
||||
#endif
|
||||
|
||||
|
||||
t0c_min = range_length;
|
||||
if ( counter == 1 ) {
|
||||
seconds_min_c = seconds_c;
|
||||
t1c_min = T1;
|
||||
t2c_min = T2;
|
||||
}
|
||||
else {
|
||||
if ( seconds_c < seconds_min_c )
|
||||
{
|
||||
seconds_min_c = seconds_c;
|
||||
t1c_min = T1;
|
||||
t2c_min = T2;
|
||||
}
|
||||
}
|
||||
++counter;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
std::cout
|
||||
// << "--------------------------------------------------------------\n"
|
||||
<< label_mdrange
|
||||
<< " Collapse<2> style: "
|
||||
<< "\n Min values "
|
||||
<< "\n Range length per dim (3D): " << range_length
|
||||
<< "\n TileDims: " << t0c_min << " , " << t1c_min << " , " << t2c_min
|
||||
<< "\n Min time: " << seconds_min_c
|
||||
<< "\n---------------------------------------------------------------"
|
||||
<< std::endl ;
|
||||
} //end scope test 2
|
||||
#endif
|
||||
|
||||
|
||||
// Test 2: RangePolicy Collapse2 style
|
||||
double seconds_2 = 0;
|
||||
{ seconds_2 = RangePolicyCollapseTwo< DeviceType , double , LayoutType >::test_index_collapse_two(range_length,range_length,range_length) ; }
|
||||
std::cout << label_range_col2
|
||||
<< " , " << range_length
|
||||
<< " , " << seconds_2
|
||||
<< std::endl ;
|
||||
|
||||
|
||||
// Test 3: RangePolicy Collapse all style - not necessary, always slow
|
||||
/*
|
||||
double seconds_3 = 0;
|
||||
{ seconds_3 = RangePolicyCollapseAll< DeviceType , double , LayoutType >::test_collapse_all(range_length,range_length,range_length) ; }
|
||||
std::cout << label_range_col_all
|
||||
<< " , " << range_length
|
||||
<< " , " << seconds_3
|
||||
<< "\n---------------------------------------------------------------"
|
||||
<< std::endl ;
|
||||
*/
|
||||
|
||||
// Compare fastest times... will never be collapse all so ignore it
|
||||
// seconds_min = tiled MDRange
|
||||
// seconds_min_c = collapse<2>-like MDRange (tiledim = span for fast dim) - only for non-Cuda, else tile too long
|
||||
// seconds_2 = collapse<2>-style RangePolicy
|
||||
// seconds_3 = collapse<3>-style RangePolicy
|
||||
|
||||
#if !defined(KOKKOS_HAVE_CUDA)
|
||||
if ( seconds_min < seconds_min_c ) {
|
||||
if ( seconds_min < seconds_2 ) {
|
||||
std::cout << "--------------------------------------------------------------\n"
|
||||
<< " Fastest run: MDRange tiled\n"
|
||||
<< " Time: " << seconds_min
|
||||
<< " Difference: " << seconds_2 - seconds_min
|
||||
<< " Other times: \n"
|
||||
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
|
||||
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
//<< "\n\n"
|
||||
<< std::endl;
|
||||
}
|
||||
else if ( seconds_min > seconds_2 ) {
|
||||
std::cout << " Fastest run: Collapse2 RangePolicy\n"
|
||||
<< " Time: " << seconds_2
|
||||
<< " Difference: " << seconds_min - seconds_2
|
||||
<< " Other times: \n"
|
||||
<< " MDrange Tiled: " << seconds_min << "\n"
|
||||
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
//<< "\n\n"
|
||||
<< std::endl;
|
||||
}
|
||||
}
|
||||
else if ( seconds_min > seconds_min_c ) {
|
||||
if ( seconds_min_c < seconds_2 ) {
|
||||
std::cout << "--------------------------------------------------------------\n"
|
||||
<< " Fastest run: MDRange collapse-like (tiledim = span on fast dim) type\n"
|
||||
<< " Time: " << seconds_min_c
|
||||
<< " Difference: " << seconds_2 - seconds_min_c
|
||||
<< " Other times: \n"
|
||||
<< " MDrange Tiled: " << seconds_min << "\n"
|
||||
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
//<< "\n\n"
|
||||
<< std::endl;
|
||||
}
|
||||
else if ( seconds_min_c > seconds_2 ) {
|
||||
std::cout << " Fastest run: Collapse2 RangePolicy\n"
|
||||
<< " Time: " << seconds_2
|
||||
<< " Difference: " << seconds_min_c - seconds_2
|
||||
<< " Other times: \n"
|
||||
<< " MDrange Tiled: " << seconds_min << "\n"
|
||||
<< " MDrange collapse-like (tiledim = span on fast dim) type: " << seconds_min_c << "\n"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
//<< "\n\n"
|
||||
<< std::endl;
|
||||
}
|
||||
} // end else if
|
||||
#else
|
||||
if ( seconds_min < seconds_2 ) {
|
||||
std::cout << "--------------------------------------------------------------\n"
|
||||
<< " Fastest run: MDRange tiled\n"
|
||||
<< " Time: " << seconds_min
|
||||
<< " Difference: " << seconds_2 - seconds_min
|
||||
<< " Other times: \n"
|
||||
<< " Collapse2 Range Policy: " << seconds_2 << "\n"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
//<< "\n\n"
|
||||
<< std::endl;
|
||||
}
|
||||
else if ( seconds_min > seconds_2 ) {
|
||||
std::cout << " Fastest run: Collapse2 RangePolicy\n"
|
||||
<< " Time: " << seconds_2
|
||||
<< " Difference: " << seconds_min - seconds_2
|
||||
<< " Other times: \n"
|
||||
<< " MDrange Tiled: " << seconds_min << "\n"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
<< "\n--------------------------------------------------------------"
|
||||
//<< "\n\n"
|
||||
<< std::endl;
|
||||
}
|
||||
#endif
|
||||
|
||||
} //end for
|
||||
|
||||
#undef MDRANGE_PERFORMANCE_OUTPUT_VERBOSE
|
||||
|
||||
}
|
||||
|
||||
|
||||
template< class DeviceType >
|
||||
|
||||
@ -66,6 +66,8 @@ const char TestHostDeviceName[] = "Kokkos::Serial" ;
|
||||
|
||||
#include <impl/Kokkos_Timer.hpp>
|
||||
|
||||
#include <PerfTestMDRange.hpp>
|
||||
|
||||
#include <PerfTestHexGrad.hpp>
|
||||
#include <PerfTestBlasKernels.hpp>
|
||||
#include <PerfTestGramSchmidt.hpp>
|
||||
@ -102,6 +104,14 @@ protected:
|
||||
}
|
||||
};
|
||||
|
||||
//TEST_F( host, mdrange_lr ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutRight> (5, 8, TestHostDeviceName) ) );
|
||||
//}
|
||||
|
||||
//TEST_F( host, mdrange_ll ) {
|
||||
// EXPECT_NO_THROW( (run_test_mdrange<TestHostDevice , Kokkos::LayoutLeft> (5, 8, TestHostDeviceName) ) );
|
||||
//}
|
||||
|
||||
TEST_F( host, hexgrad ) {
|
||||
EXPECT_NO_THROW(run_test_hexgrad< TestHostDevice>( 10, 20, TestHostDeviceName ));
|
||||
}
|
||||
|
||||
564
lib/kokkos/core/perf_test/PerfTestMDRange.hpp
Normal file
564
lib/kokkos/core/perf_test/PerfTestMDRange.hpp
Normal file
@ -0,0 +1,564 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
namespace Test {
|
||||
template< class DeviceType
|
||||
, typename ScalarType = double
|
||||
, typename TestLayout = Kokkos::LayoutRight
|
||||
>
|
||||
struct MultiDimRangePerf3D
|
||||
{
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
|
||||
using iterate_type = Kokkos::Experimental::Iterate;
|
||||
|
||||
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
|
||||
typedef typename view_type::HostMirror host_view_type;
|
||||
|
||||
view_type A;
|
||||
view_type B;
|
||||
const long irange;
|
||||
const long jrange;
|
||||
const long krange;
|
||||
|
||||
MultiDimRangePerf3D(const view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
|
||||
: A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const long i, const long j, const long k) const
|
||||
{
|
||||
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
|
||||
+ B(i,j+2,k) + B(i,j+1,k)
|
||||
+ B(i,j,k+2) + B(i,j,k+1)
|
||||
+ B(i,j,k) );
|
||||
}
|
||||
|
||||
|
||||
struct InitZeroTag {};
|
||||
// struct InitViewTag {};
|
||||
|
||||
struct Init
|
||||
{
|
||||
|
||||
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
|
||||
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const long i, const long j, const long k) const
|
||||
{
|
||||
input(i,j,k) = 1.0;
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const InitZeroTag&, const long i, const long j, const long k) const
|
||||
{
|
||||
input(i,j,k) = 0;
|
||||
}
|
||||
|
||||
view_type input;
|
||||
const long irange;
|
||||
const long jrange;
|
||||
const long krange;
|
||||
};
|
||||
|
||||
|
||||
static double test_multi_index(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const unsigned int Ti = 1, const unsigned int Tj = 1, const unsigned int Tk = 1, const long iter = 1)
|
||||
{
|
||||
//This test performs multidim range over all dims
|
||||
view_type Atest("Atest", icount, jcount, kcount);
|
||||
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
|
||||
typedef MultiDimRangePerf3D<execution_space,ScalarType,TestLayout> FunctorType;
|
||||
|
||||
double dt_min = 0;
|
||||
|
||||
// LayoutRight
|
||||
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
|
||||
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}});
|
||||
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}});
|
||||
|
||||
typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > MDRangeType;
|
||||
using tile_type = typename MDRangeType::tile_type;
|
||||
using point_type = typename MDRangeType::point_type;
|
||||
|
||||
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Right, iterate_type::Right>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
|
||||
|
||||
Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
|
||||
execution_space::fence();
|
||||
Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
|
||||
execution_space::fence();
|
||||
|
||||
for (int i = 0; i < iter; ++i)
|
||||
{
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
|
||||
execution_space::fence();
|
||||
const double dt = timer.seconds();
|
||||
if ( 0 == i ) dt_min = dt ;
|
||||
else dt_min = dt < dt_min ? dt : dt_min ;
|
||||
|
||||
//Correctness check - only the first run
|
||||
if ( 0 == i )
|
||||
{
|
||||
long numErrors = 0;
|
||||
host_view_type Ahost("Ahost", icount, jcount, kcount);
|
||||
Kokkos::deep_copy(Ahost, Atest);
|
||||
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
|
||||
Kokkos::deep_copy(Bhost, Btest);
|
||||
|
||||
// On KNL, this may vectorize - add print statement to prevent
|
||||
// Also, compare against epsilon, as vectorization can change bitwise answer
|
||||
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
|
||||
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
|
||||
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
|
||||
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
|
||||
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
|
||||
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
|
||||
+ Bhost(l,j,k) );
|
||||
if ( Ahost(l,j,k) - check != 0 ) {
|
||||
++numErrors;
|
||||
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
|
||||
<< " multi Ahost = " << Ahost(l,j,k) << " expected = " << check
|
||||
<< " multi Bhost(ijk) = " << Bhost(l,j,k)
|
||||
<< " multi Bhost(l+1jk) = " << Bhost(l+1,j,k)
|
||||
<< " multi Bhost(l+2jk) = " << Bhost(l+2,j,k)
|
||||
<< " multi Bhost(ij+1k) = " << Bhost(l,j+1,k)
|
||||
<< " multi Bhost(ij+2k) = " << Bhost(l,j+2,k)
|
||||
<< " multi Bhost(ijk+1) = " << Bhost(l,j,k+1)
|
||||
<< " multi Bhost(ijk+2) = " << Bhost(l,j,k+2)
|
||||
<< std::endl;
|
||||
//exit(-1);
|
||||
}
|
||||
} } }
|
||||
if ( numErrors != 0 ) { std::cout << "LR multi: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
|
||||
//else { std::cout << " multi: No errors!" << std::endl; }
|
||||
}
|
||||
} //end for
|
||||
|
||||
}
|
||||
// LayoutLeft
|
||||
else {
|
||||
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initA({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}});
|
||||
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3,iterate_type::Left,iterate_type::Left>, execution_space > policy_initB({{0,0,0}},{{icount+2,jcount+2,kcount+2}},{{Ti,Tj,Tk}});
|
||||
|
||||
//typedef typename Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > MDRangeType;
|
||||
//using tile_type = typename MDRangeType::tile_type;
|
||||
//using point_type = typename MDRangeType::point_type;
|
||||
//Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy(point_type{{0,0,0}},point_type{{icount,jcount,kcount}},tile_type{{Ti,Tj,Tk}} );
|
||||
Kokkos::Experimental::MDRangePolicy<Kokkos::Experimental::Rank<3, iterate_type::Left, iterate_type::Left>, execution_space > policy({{0,0,0}},{{icount,jcount,kcount}},{{Ti,Tj,Tk}} );
|
||||
|
||||
Kokkos::Experimental::md_parallel_for( policy_initA, Init(Atest, icount, jcount, kcount) );
|
||||
execution_space::fence();
|
||||
Kokkos::Experimental::md_parallel_for( policy_initB, Init(Btest, icount+2, jcount+2, kcount+2) );
|
||||
execution_space::fence();
|
||||
|
||||
for (int i = 0; i < iter; ++i)
|
||||
{
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::Experimental::md_parallel_for( policy, FunctorType(Atest, Btest, icount, jcount, kcount) );
|
||||
execution_space::fence();
|
||||
const double dt = timer.seconds();
|
||||
if ( 0 == i ) dt_min = dt ;
|
||||
else dt_min = dt < dt_min ? dt : dt_min ;
|
||||
|
||||
//Correctness check - only the first run
|
||||
if ( 0 == i )
|
||||
{
|
||||
long numErrors = 0;
|
||||
host_view_type Ahost("Ahost", icount, jcount, kcount);
|
||||
Kokkos::deep_copy(Ahost, Atest);
|
||||
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
|
||||
Kokkos::deep_copy(Bhost, Btest);
|
||||
|
||||
// On KNL, this may vectorize - add print statement to prevent
|
||||
// Also, compare against epsilon, as vectorization can change bitwise answer
|
||||
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
|
||||
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
|
||||
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
|
||||
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
|
||||
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
|
||||
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
|
||||
+ Bhost(l,j,k) );
|
||||
if ( Ahost(l,j,k) - check != 0 ) {
|
||||
++numErrors;
|
||||
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
|
||||
<< " multi Ahost = " << Ahost(l,j,k) << " expected = " << check
|
||||
<< " multi Bhost(ijk) = " << Bhost(l,j,k)
|
||||
<< " multi Bhost(l+1jk) = " << Bhost(l+1,j,k)
|
||||
<< " multi Bhost(l+2jk) = " << Bhost(l+2,j,k)
|
||||
<< " multi Bhost(ij+1k) = " << Bhost(l,j+1,k)
|
||||
<< " multi Bhost(ij+2k) = " << Bhost(l,j+2,k)
|
||||
<< " multi Bhost(ijk+1) = " << Bhost(l,j,k+1)
|
||||
<< " multi Bhost(ijk+2) = " << Bhost(l,j,k+2)
|
||||
<< std::endl;
|
||||
//exit(-1);
|
||||
}
|
||||
} } }
|
||||
if ( numErrors != 0 ) { std::cout << " LL multi run: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
|
||||
//else { std::cout << " multi: No errors!" << std::endl; }
|
||||
|
||||
}
|
||||
} //end for
|
||||
}
|
||||
|
||||
return dt_min;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
template< class DeviceType
|
||||
, typename ScalarType = double
|
||||
, typename TestLayout = Kokkos::LayoutRight
|
||||
>
|
||||
struct RangePolicyCollapseTwo
|
||||
{
|
||||
// RangePolicy for 3D range, but will collapse only 2 dims => like Rank<2> for multi-dim; unroll 2 dims in one-dim
|
||||
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
typedef TestLayout layout;
|
||||
|
||||
using iterate_type = Kokkos::Experimental::Iterate;
|
||||
|
||||
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
|
||||
typedef typename view_type::HostMirror host_view_type;
|
||||
|
||||
view_type A;
|
||||
view_type B;
|
||||
const long irange;
|
||||
const long jrange;
|
||||
const long krange;
|
||||
|
||||
RangePolicyCollapseTwo(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
|
||||
: A(A_), B(B_) , irange(irange_), jrange(jrange_), krange(krange_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const long r) const
|
||||
{
|
||||
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
|
||||
{
|
||||
//id(i,j,k) = k + j*Nk + i*Nk*Nj = k + Nk*(j + i*Nj) = k + Nk*r
|
||||
//r = j + i*Nj
|
||||
long i = int(r / jrange);
|
||||
long j = int( r - i*jrange);
|
||||
for (int k = 0; k < krange; ++k) {
|
||||
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
|
||||
+ B(i,j+2,k) + B(i,j+1,k)
|
||||
+ B(i,j,k+2) + B(i,j,k+1)
|
||||
+ B(i,j,k) );
|
||||
}
|
||||
}
|
||||
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
|
||||
{
|
||||
//id(i,j,k) = i + j*Ni + k*Ni*Nj = i + Ni*(j + k*Nj) = i + Ni*r
|
||||
//r = j + k*Nj
|
||||
long k = int(r / jrange);
|
||||
long j = int( r - k*jrange);
|
||||
for (int i = 0; i < irange; ++i) {
|
||||
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
|
||||
+ B(i,j+2,k) + B(i,j+1,k)
|
||||
+ B(i,j,k+2) + B(i,j,k+1)
|
||||
+ B(i,j,k) );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct Init
|
||||
{
|
||||
view_type input;
|
||||
const long irange;
|
||||
const long jrange;
|
||||
const long krange;
|
||||
|
||||
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
|
||||
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const long r) const
|
||||
{
|
||||
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
|
||||
{
|
||||
long i = int(r / jrange);
|
||||
long j = int( r - i*jrange);
|
||||
for (int k = 0; k < krange; ++k) {
|
||||
input(i,j,k) = 1;
|
||||
}
|
||||
}
|
||||
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
|
||||
{
|
||||
long k = int(r / jrange);
|
||||
long j = int( r - k*jrange);
|
||||
for (int i = 0; i < irange; ++i) {
|
||||
input(i,j,k) = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static double test_index_collapse_two(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
|
||||
{
|
||||
// This test refers to collapsing two dims while using the RangePolicy
|
||||
view_type Atest("Atest", icount, jcount, kcount);
|
||||
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
|
||||
typedef RangePolicyCollapseTwo<execution_space,ScalarType,TestLayout> FunctorType;
|
||||
|
||||
long collapse_index_rangeA = 0;
|
||||
long collapse_index_rangeB = 0;
|
||||
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value ) {
|
||||
collapse_index_rangeA = icount*jcount;
|
||||
collapse_index_rangeB = (icount+2)*(jcount+2);
|
||||
// std::cout << " LayoutRight " << std::endl;
|
||||
} else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value ) {
|
||||
collapse_index_rangeA = kcount*jcount;
|
||||
collapse_index_rangeB = (kcount+2)*(jcount+2);
|
||||
// std::cout << " LayoutLeft " << std::endl;
|
||||
} else {
|
||||
std::cout << " LayoutRight or LayoutLeft required - will pass 0 as range instead " << std::endl;
|
||||
exit(-1);
|
||||
}
|
||||
|
||||
Kokkos::RangePolicy<execution_space> policy(0, (collapse_index_rangeA) );
|
||||
Kokkos::RangePolicy<execution_space> policy_initB(0, (collapse_index_rangeB) );
|
||||
|
||||
double dt_min = 0;
|
||||
|
||||
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
|
||||
execution_space::fence();
|
||||
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
|
||||
execution_space::fence();
|
||||
|
||||
for (int i = 0; i < iter; ++i)
|
||||
{
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
|
||||
execution_space::fence();
|
||||
const double dt = timer.seconds();
|
||||
if ( 0 == i ) dt_min = dt ;
|
||||
else dt_min = dt < dt_min ? dt : dt_min ;
|
||||
|
||||
//Correctness check - first iteration only
|
||||
if ( 0 == i )
|
||||
{
|
||||
long numErrors = 0;
|
||||
host_view_type Ahost("Ahost", icount, jcount, kcount);
|
||||
Kokkos::deep_copy(Ahost, Atest);
|
||||
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
|
||||
Kokkos::deep_copy(Bhost, Btest);
|
||||
|
||||
// On KNL, this may vectorize - add print statement to prevent
|
||||
// Also, compare against epsilon, as vectorization can change bitwise answer
|
||||
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
|
||||
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
|
||||
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
|
||||
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
|
||||
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
|
||||
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
|
||||
+ Bhost(l,j,k) );
|
||||
if ( Ahost(l,j,k) - check != 0 ) {
|
||||
++numErrors;
|
||||
std::cout << " Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
|
||||
<< " flat Ahost = " << Ahost(l,j,k) << " expected = " << check << std::endl;
|
||||
//exit(-1);
|
||||
}
|
||||
} } }
|
||||
if ( numErrors != 0 ) { std::cout << " RP collapse2: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
|
||||
//else { std::cout << " RP collapse2: Pass! " << std::endl; }
|
||||
}
|
||||
}
|
||||
|
||||
return dt_min;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
|
||||
template< class DeviceType
|
||||
, typename ScalarType = double
|
||||
, typename TestLayout = Kokkos::LayoutRight
|
||||
>
|
||||
struct RangePolicyCollapseAll
|
||||
{
|
||||
// RangePolicy for 3D range, but will collapse all dims
|
||||
|
||||
typedef DeviceType execution_space;
|
||||
typedef typename execution_space::size_type size_type;
|
||||
typedef TestLayout layout;
|
||||
|
||||
typedef Kokkos::View<ScalarType***, TestLayout, DeviceType> view_type;
|
||||
typedef typename view_type::HostMirror host_view_type;
|
||||
|
||||
view_type A;
|
||||
view_type B;
|
||||
const long irange;
|
||||
const long jrange;
|
||||
const long krange;
|
||||
|
||||
RangePolicyCollapseAll(view_type & A_, const view_type & B_, const long &irange_, const long &jrange_, const long &krange_)
|
||||
: A(A_), B(B_), irange(irange_), jrange(jrange_), krange(krange_)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const long r) const
|
||||
{
|
||||
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
|
||||
{
|
||||
long i = int(r / (jrange*krange));
|
||||
long j = int(( r - i*jrange*krange)/krange);
|
||||
long k = int(r - i*jrange*krange - j*krange);
|
||||
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
|
||||
+ B(i,j+2,k) + B(i,j+1,k)
|
||||
+ B(i,j,k+2) + B(i,j,k+1)
|
||||
+ B(i,j,k) );
|
||||
}
|
||||
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
|
||||
{
|
||||
long k = int(r / (irange*jrange));
|
||||
long j = int(( r - k*irange*jrange)/irange);
|
||||
long i = int(r - k*irange*jrange - j*irange);
|
||||
A(i,j,k) = 0.25*(ScalarType)( B(i+2,j,k) + B(i+1,j,k)
|
||||
+ B(i,j+2,k) + B(i,j+1,k)
|
||||
+ B(i,j,k+2) + B(i,j,k+1)
|
||||
+ B(i,j,k) );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
struct Init
|
||||
{
|
||||
view_type input;
|
||||
const long irange;
|
||||
const long jrange;
|
||||
const long krange;
|
||||
|
||||
Init(const view_type & input_, const long &irange_, const long &jrange_, const long &krange_)
|
||||
: input(input_), irange(irange_), jrange(jrange_), krange(krange_) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void operator()(const long r) const
|
||||
{
|
||||
if ( std::is_same<TestLayout, Kokkos::LayoutRight>::value )
|
||||
{
|
||||
long i = int(r / (jrange*krange));
|
||||
long j = int(( r - i*jrange*krange)/krange);
|
||||
long k = int(r - i*jrange*krange - j*krange);
|
||||
input(i,j,k) = 1;
|
||||
}
|
||||
else if ( std::is_same<TestLayout, Kokkos::LayoutLeft>::value )
|
||||
{
|
||||
long k = int(r / (irange*jrange));
|
||||
long j = int(( r - k*irange*jrange)/irange);
|
||||
long i = int(r - k*irange*jrange - j*irange);
|
||||
input(i,j,k) = 1;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
static double test_collapse_all(const unsigned int icount, const unsigned int jcount, const unsigned int kcount, const long iter = 1)
|
||||
{
|
||||
//This test refers to collapsing all dims using the RangePolicy
|
||||
view_type Atest("Atest", icount, jcount, kcount);
|
||||
view_type Btest("Btest", icount+2, jcount+2, kcount+2);
|
||||
typedef RangePolicyCollapseAll<execution_space,ScalarType,TestLayout> FunctorType;
|
||||
|
||||
const long flat_index_range = icount*jcount*kcount;
|
||||
Kokkos::RangePolicy<execution_space> policy(0, flat_index_range );
|
||||
Kokkos::RangePolicy<execution_space> policy_initB(0, (icount+2)*(jcount+2)*(kcount+2) );
|
||||
|
||||
double dt_min = 0;
|
||||
|
||||
Kokkos::parallel_for( policy, Init(Atest,icount,jcount,kcount) );
|
||||
execution_space::fence();
|
||||
Kokkos::parallel_for( policy_initB, Init(Btest,icount+2,jcount+2,kcount+2) );
|
||||
execution_space::fence();
|
||||
|
||||
for (int i = 0; i < iter; ++i)
|
||||
{
|
||||
Kokkos::Timer timer;
|
||||
Kokkos::parallel_for(policy, FunctorType(Atest, Btest, icount, jcount, kcount));
|
||||
execution_space::fence();
|
||||
const double dt = timer.seconds();
|
||||
if ( 0 == i ) dt_min = dt ;
|
||||
else dt_min = dt < dt_min ? dt : dt_min ;
|
||||
|
||||
//Correctness check - first iteration only
|
||||
if ( 0 == i )
|
||||
{
|
||||
long numErrors = 0;
|
||||
host_view_type Ahost("Ahost", icount, jcount, kcount);
|
||||
Kokkos::deep_copy(Ahost, Atest);
|
||||
host_view_type Bhost("Bhost", icount+2, jcount+2, kcount+2);
|
||||
Kokkos::deep_copy(Bhost, Btest);
|
||||
|
||||
// On KNL, this may vectorize - add print statement to prevent
|
||||
// Also, compare against epsilon, as vectorization can change bitwise answer
|
||||
for ( long l = 0; l < static_cast<long>(icount); ++l ) {
|
||||
for ( long j = 0; j < static_cast<long>(jcount); ++j ) {
|
||||
for ( long k = 0; k < static_cast<long>(kcount); ++k ) {
|
||||
ScalarType check = 0.25*(ScalarType)( Bhost(l+2,j,k) + Bhost(l+1,j,k)
|
||||
+ Bhost(l,j+2,k) + Bhost(l,j+1,k)
|
||||
+ Bhost(l,j,k+2) + Bhost(l,j,k+1)
|
||||
+ Bhost(l,j,k) );
|
||||
if ( Ahost(l,j,k) - check != 0 ) {
|
||||
++numErrors;
|
||||
std::cout << " Callapse ALL Correctness error at index: " << l << ","<<j<<","<<k<<"\n"
|
||||
<< " flat Ahost = " << Ahost(l,j,k) << " expected = " << check << std::endl;
|
||||
//exit(-1);
|
||||
}
|
||||
} } }
|
||||
if ( numErrors != 0 ) { std::cout << " RP collapse all: errors " << numErrors << " range product " << icount*jcount*kcount << " LL " << jcount*kcount << " LR " << icount*jcount << std::endl; }
|
||||
//else { std::cout << " RP collapse all: Pass! " << std::endl; }
|
||||
}
|
||||
}
|
||||
|
||||
return dt_min;
|
||||
}
|
||||
|
||||
};
|
||||
|
||||
} //end namespace Test
|
||||
@ -92,13 +92,13 @@ LIST(APPEND SOURCES ${SOURCES_CUDA} )
|
||||
INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/)
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
FILE(GLOB HEADERS_QTHREAD Qthread/*.hpp)
|
||||
FILE(GLOB SOURCES_QTHREAD Qthread/*.cpp)
|
||||
FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp)
|
||||
FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp)
|
||||
|
||||
LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREAD} )
|
||||
LIST(APPEND SOURCES ${SOURCES_QTHREAD} )
|
||||
LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} )
|
||||
LIST(APPEND SOURCES ${SOURCES_QTHREADS} )
|
||||
|
||||
INSTALL(FILES ${HEADERS_QTHREAD} DESTINATION ${TRILINOS_INCDIR}/Qthread/)
|
||||
INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/)
|
||||
|
||||
#-----------------------------------------------------------------------------
|
||||
|
||||
@ -109,5 +109,3 @@ TRIBITS_ADD_LIBRARY(
|
||||
SOURCES ${SOURCES}
|
||||
DEPLIBS
|
||||
)
|
||||
|
||||
|
||||
|
||||
1300
lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
Normal file
1300
lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -131,6 +131,7 @@ namespace Impl {
|
||||
int* atomic;
|
||||
int* scratch;
|
||||
int* threadid;
|
||||
int n;
|
||||
};
|
||||
}
|
||||
}
|
||||
@ -250,6 +251,7 @@ struct CudaParallelLaunch< DriverType , true > {
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
|
||||
@ -292,6 +294,7 @@ struct CudaParallelLaunch< DriverType , false > {
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
|
||||
|
||||
@ -59,7 +59,7 @@
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
|
||||
@ -375,7 +375,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
|
||||
SharedAllocationRecord< Kokkos::CudaSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
|
||||
SharedAllocationHeader header ;
|
||||
@ -395,7 +395,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::
|
||||
SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::fence(); //Make sure I can access the label ...
|
||||
Kokkos::Profiling::deallocateData(
|
||||
@ -412,7 +412,7 @@ SharedAllocationRecord< Kokkos::CudaUVMSpace , void >::
|
||||
SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::CudaHostPinnedSpace::name()),RecordBase::m_alloc_ptr->m_label,
|
||||
@ -442,7 +442,7 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space
|
||||
, m_tex_obj( 0 )
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
@ -479,7 +479,7 @@ SharedAllocationRecord( const Kokkos::CudaUVMSpace & arg_space
|
||||
, m_tex_obj( 0 )
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
@ -510,7 +510,7 @@ SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace & arg_space
|
||||
)
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
@ -883,6 +883,7 @@ void init_lock_arrays_cuda_space() {
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
|
||||
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
|
||||
|
||||
@ -536,6 +536,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
|
||||
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
|
||||
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
|
||||
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
|
||||
locks.n = Kokkos::Cuda::concurrency();
|
||||
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
|
||||
#endif
|
||||
}
|
||||
@ -620,9 +621,9 @@ void CudaInternal::finalize()
|
||||
was_finalized = 1;
|
||||
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
|
||||
|
||||
atomic_lock_array_cuda_space_ptr(false);
|
||||
scratch_lock_array_cuda_space_ptr(false);
|
||||
threadid_lock_array_cuda_space_ptr(false);
|
||||
atomic_lock_array_cuda_space_ptr(true);
|
||||
scratch_lock_array_cuda_space_ptr(true);
|
||||
threadid_lock_array_cuda_space_ptr(true);
|
||||
|
||||
if ( m_stream ) {
|
||||
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
|
||||
@ -700,7 +701,7 @@ void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
|
||||
{
|
||||
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
@ -739,7 +740,7 @@ void Cuda::finalize()
|
||||
{
|
||||
Impl::CudaInternal::singleton().finalize();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -61,7 +61,7 @@
|
||||
#include <Cuda/Kokkos_Cuda_Internal.hpp>
|
||||
#include <Kokkos_Vectorization.hpp>
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
@ -586,13 +586,35 @@ public:
|
||||
void operator()(void) const
|
||||
{
|
||||
// Iterate this block through the league
|
||||
int threadid = 0;
|
||||
if ( m_scratch_size[1]>0 ) {
|
||||
__shared__ int base_thread_id;
|
||||
if (threadIdx.x==0 && threadIdx.y==0 ) {
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
|
||||
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
int done = 0;
|
||||
while (!done) {
|
||||
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
|
||||
if(!done) {
|
||||
threadid += blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
|
||||
}
|
||||
}
|
||||
base_thread_id = threadid;
|
||||
}
|
||||
__syncthreads();
|
||||
threadid = base_thread_id;
|
||||
}
|
||||
|
||||
|
||||
for ( int league_rank = blockIdx.x ; league_rank < m_league_size ; league_rank += gridDim.x ) {
|
||||
|
||||
this-> template exec_team< WorkTag >(
|
||||
typename Policy::member_type( kokkos_impl_cuda_shared_memory<void>()
|
||||
, m_shmem_begin
|
||||
, m_shmem_size
|
||||
, m_scratch_ptr[1]
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
|
||||
, m_scratch_size[1]
|
||||
, league_rank
|
||||
, m_league_size ) );
|
||||
@ -946,11 +968,32 @@ public:
|
||||
|
||||
__device__ inline
|
||||
void operator() () const {
|
||||
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0) );
|
||||
int threadid = 0;
|
||||
if ( m_scratch_size[1]>0 ) {
|
||||
__shared__ int base_thread_id;
|
||||
if (threadIdx.x==0 && threadIdx.y==0 ) {
|
||||
threadid = ((blockIdx.x*blockDim.z + threadIdx.z) * blockDim.x * blockDim.y) % kokkos_impl_cuda_lock_arrays.n;
|
||||
threadid = ((threadid + blockDim.x * blockDim.y-1)/(blockDim.x * blockDim.y)) * blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid-=blockDim.x * blockDim.y;
|
||||
int done = 0;
|
||||
while (!done) {
|
||||
done = (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[threadid],0,1));
|
||||
if(!done) {
|
||||
threadid += blockDim.x * blockDim.y;
|
||||
if(threadid > kokkos_impl_cuda_lock_arrays.n) threadid = 0;
|
||||
}
|
||||
}
|
||||
base_thread_id = threadid;
|
||||
}
|
||||
__syncthreads();
|
||||
threadid = base_thread_id;
|
||||
}
|
||||
|
||||
run(Kokkos::Impl::if_c<UseShflReduction, DummyShflReductionType, DummySHMEMReductionType>::select(1,1.0), threadid );
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummySHMEMReductionType&) const
|
||||
void run(const DummySHMEMReductionType&, const int& threadid) const
|
||||
{
|
||||
const integral_nonzero_constant< size_type , ValueTraits::StaticValueSize / sizeof(size_type) >
|
||||
word_count( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) / sizeof(size_type) );
|
||||
@ -964,7 +1007,7 @@ public:
|
||||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
, m_shmem_size
|
||||
, m_scratch_ptr[1]
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
|
||||
, m_scratch_size[1]
|
||||
, league_rank
|
||||
, m_league_size )
|
||||
@ -992,7 +1035,7 @@ public:
|
||||
}
|
||||
|
||||
__device__ inline
|
||||
void run(const DummyShflReductionType&) const
|
||||
void run(const DummyShflReductionType&, const int& threadid) const
|
||||
{
|
||||
value_type value;
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &value);
|
||||
@ -1003,7 +1046,7 @@ public:
|
||||
( Member( kokkos_impl_cuda_shared_memory<char>() + m_team_begin
|
||||
, m_shmem_begin
|
||||
, m_shmem_size
|
||||
, m_scratch_ptr[1]
|
||||
, (void*) ( ((char*)m_scratch_ptr[1]) + threadid/(blockDim.x*blockDim.y) * m_scratch_size[1])
|
||||
, m_scratch_size[1]
|
||||
, league_rank
|
||||
, m_league_size )
|
||||
@ -1128,9 +1171,9 @@ public:
|
||||
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too much L0 scratch memory"));
|
||||
}
|
||||
|
||||
if ( m_team_size >
|
||||
Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
|
||||
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length()) {
|
||||
if ( unsigned(m_team_size) >
|
||||
unsigned(Kokkos::Impl::cuda_get_max_block_size< ParallelReduce >
|
||||
( arg_functor , arg_policy.vector_length(), arg_policy.team_scratch_size(0),arg_policy.thread_scratch_size(0) ) / arg_policy.vector_length())) {
|
||||
Kokkos::Impl::throw_runtime_exception(std::string("Kokkos::Impl::ParallelReduce< Cuda > requested too large team size."));
|
||||
}
|
||||
|
||||
@ -1621,14 +1664,25 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Cuda
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
/** \brief Intra-thread vector parallel_reduce.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
* Calls lambda(iType i, ValueType & val) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of
|
||||
* the calling thread and a reduction of val is performed using +=
|
||||
* and output into result.
|
||||
*
|
||||
* The identity value for the += operator is assumed to be the default
|
||||
* constructed value.
|
||||
*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
void parallel_reduce
|
||||
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
|
||||
const & loop_boundaries
|
||||
, Lambda const & lambda
|
||||
, ValueType & result )
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
result = ValueType();
|
||||
|
||||
@ -1636,52 +1690,42 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::C
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
if (loop_boundaries.increment > 1)
|
||||
result += shfl_down(result, 1,loop_boundaries.increment);
|
||||
if (loop_boundaries.increment > 2)
|
||||
result += shfl_down(result, 2,loop_boundaries.increment);
|
||||
if (loop_boundaries.increment > 4)
|
||||
result += shfl_down(result, 4,loop_boundaries.increment);
|
||||
if (loop_boundaries.increment > 8)
|
||||
result += shfl_down(result, 8,loop_boundaries.increment);
|
||||
if (loop_boundaries.increment > 16)
|
||||
result += shfl_down(result, 16,loop_boundaries.increment);
|
||||
Impl::cuda_intra_warp_vector_reduce(
|
||||
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & result ) );
|
||||
|
||||
result = shfl(result,0,loop_boundaries.increment);
|
||||
#endif
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
/** \brief Intra-thread vector parallel_reduce.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
* Calls lambda(iType i, ValueType & val) for each i=[0..N).
|
||||
*
|
||||
* The range [0..N) is mapped to all vector lanes of
|
||||
* the calling thread and a reduction of val is performed
|
||||
* using JoinType::operator()(ValueType& val, const ValueType& update)
|
||||
* and output into result.
|
||||
*
|
||||
* The input value of result must be the identity value for the
|
||||
* reduction operation; e.g., ( 0 , += ) or ( 1 , *= ).
|
||||
*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
void parallel_reduce
|
||||
( Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::CudaTeamMember >
|
||||
const & loop_boundaries
|
||||
, Lambda const & lambda
|
||||
, JoinType const & join
|
||||
, ValueType & result )
|
||||
{
|
||||
#ifdef __CUDA_ARCH__
|
||||
ValueType result = init_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,result);
|
||||
}
|
||||
|
||||
if (loop_boundaries.increment > 1)
|
||||
join( result, shfl_down(result, 1,loop_boundaries.increment));
|
||||
if (loop_boundaries.increment > 2)
|
||||
join( result, shfl_down(result, 2,loop_boundaries.increment));
|
||||
if (loop_boundaries.increment > 4)
|
||||
join( result, shfl_down(result, 4,loop_boundaries.increment));
|
||||
if (loop_boundaries.increment > 8)
|
||||
join( result, shfl_down(result, 8,loop_boundaries.increment));
|
||||
if (loop_boundaries.increment > 16)
|
||||
join( result, shfl_down(result, 16,loop_boundaries.increment));
|
||||
Impl::cuda_intra_warp_vector_reduce(
|
||||
Impl::Reducer< ValueType , JoinType >( join , & result ) );
|
||||
|
||||
init_result = shfl(result,0,loop_boundaries.increment);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
@ -55,15 +55,163 @@
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Vectorization.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
void cuda_shfl( T & out , T const & in , int lane ,
|
||||
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
|
||||
{
|
||||
*reinterpret_cast<int*>(&out) =
|
||||
__shfl( *reinterpret_cast<int const *>(&in) , lane , width );
|
||||
}
|
||||
|
||||
//Shfl based reductions
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
void cuda_shfl( T & out , T const & in , int lane ,
|
||||
typename std::enable_if
|
||||
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
|
||||
, int >::type width )
|
||||
{
|
||||
enum : int { N = sizeof(T) / sizeof(int) };
|
||||
|
||||
for ( int i = 0 ; i < N ; ++i ) {
|
||||
reinterpret_cast<int*>(&out)[i] =
|
||||
__shfl( reinterpret_cast<int const *>(&in)[i] , lane , width );
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
void cuda_shfl_down( T & out , T const & in , int delta ,
|
||||
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
|
||||
{
|
||||
*reinterpret_cast<int*>(&out) =
|
||||
__shfl_down( *reinterpret_cast<int const *>(&in) , delta , width );
|
||||
}
|
||||
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
void cuda_shfl_down( T & out , T const & in , int delta ,
|
||||
typename std::enable_if
|
||||
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
|
||||
, int >::type width )
|
||||
{
|
||||
enum : int { N = sizeof(T) / sizeof(int) };
|
||||
|
||||
for ( int i = 0 ; i < N ; ++i ) {
|
||||
reinterpret_cast<int*>(&out)[i] =
|
||||
__shfl_down( reinterpret_cast<int const *>(&in)[i] , delta , width );
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
void cuda_shfl_up( T & out , T const & in , int delta ,
|
||||
typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width )
|
||||
{
|
||||
*reinterpret_cast<int*>(&out) =
|
||||
__shfl_up( *reinterpret_cast<int const *>(&in) , delta , width );
|
||||
}
|
||||
|
||||
template< typename T >
|
||||
__device__ inline
|
||||
void cuda_shfl_up( T & out , T const & in , int delta ,
|
||||
typename std::enable_if
|
||||
< ( sizeof(int) < sizeof(T) ) && ( 0 == ( sizeof(T) % sizeof(int) ) )
|
||||
, int >::type width )
|
||||
{
|
||||
enum : int { N = sizeof(T) / sizeof(int) };
|
||||
|
||||
for ( int i = 0 ; i < N ; ++i ) {
|
||||
reinterpret_cast<int*>(&out)[i] =
|
||||
__shfl_up( reinterpret_cast<int const *>(&in)[i] , delta , width );
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Reduce within a warp over blockDim.x, the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Requires:
|
||||
* blockDim.x is power of 2
|
||||
* blockDim.x <= 32 (one warp)
|
||||
*
|
||||
* Cannot use "butterfly" pattern because floating point
|
||||
* addition is non-associative. Therefore, must broadcast
|
||||
* the final result.
|
||||
*/
|
||||
template< class Reducer >
|
||||
__device__ inline
|
||||
void cuda_intra_warp_vector_reduce( Reducer const & reducer )
|
||||
{
|
||||
static_assert(
|
||||
std::is_reference< typename Reducer::reference_type >::value , "" );
|
||||
|
||||
if ( 1 < blockDim.x ) {
|
||||
|
||||
typename Reducer::value_type tmp ;
|
||||
|
||||
for ( int i = blockDim.x ; ( i >>= 1 ) ; ) {
|
||||
|
||||
cuda_shfl_down( tmp , reducer.reference() , i , blockDim.x );
|
||||
|
||||
if ( threadIdx.x < i ) { reducer.join( reducer.data() , & tmp ); }
|
||||
}
|
||||
|
||||
// Broadcast from root "lane" to all other "lanes"
|
||||
|
||||
cuda_shfl( reducer.reference() , reducer.reference() , 0 , blockDim.x );
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Inclusive scan over blockDim.x, the "vector" dimension.
|
||||
*
|
||||
* This will be called within a nested, intra-team parallel operation.
|
||||
* Use shuffle operations to avoid conflicts with shared memory usage.
|
||||
*
|
||||
* Algorithm is concurrent bottom-up reductions in triangular pattern
|
||||
* where each CUDA thread is the root of a reduction tree from the
|
||||
* zeroth CUDA thread to itself.
|
||||
*
|
||||
* Requires:
|
||||
* blockDim.x is power of 2
|
||||
* blockDim.x <= 32 (one warp)
|
||||
*/
|
||||
template< typename ValueType >
|
||||
__device__ inline
|
||||
void cuda_intra_warp_vector_inclusive_scan( ValueType & local )
|
||||
{
|
||||
ValueType tmp ;
|
||||
|
||||
// Bottom up:
|
||||
// [t] += [t-1] if t >= 1
|
||||
// [t] += [t-2] if t >= 2
|
||||
// [t] += [t-4] if t >= 4
|
||||
// ...
|
||||
|
||||
for ( int i = 1 ; i < blockDim.x ; i <<= 1 ) {
|
||||
|
||||
cuda_shfl_up( tmp , local , i , blockDim.x );
|
||||
|
||||
if ( i <= threadIdx.x ) { local += tmp ; }
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/*
|
||||
* Algorithmic constraints:
|
||||
* (a) threads with same threadIdx.y have same value
|
||||
@ -98,7 +246,10 @@ inline void cuda_inter_warp_reduction( ValueType& value,
|
||||
const int max_active_thread = blockDim.y) {
|
||||
|
||||
#define STEP_WIDTH 4
|
||||
__shared__ char sh_result[sizeof(ValueType)*STEP_WIDTH];
|
||||
// Depending on the ValueType _shared__ memory must be aligned up to 8byte boundaries
|
||||
// The reason not to use ValueType directly is that for types with constructors it
|
||||
// could lead to race conditions
|
||||
__shared__ double sh_result[(sizeof(ValueType)+7)/8*STEP_WIDTH];
|
||||
ValueType* result = (ValueType*) & sh_result;
|
||||
const unsigned step = 32 / blockDim.x;
|
||||
unsigned shift = STEP_WIDTH;
|
||||
|
||||
@ -91,7 +91,7 @@ void TaskQueueSpecialization< Kokkos::Cuda >::driver
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
|
||||
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
|
||||
task.ptr = Queue::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -61,6 +61,8 @@ void set_cuda_task_base_apply_function_pointer
|
||||
|
||||
}
|
||||
|
||||
template< class > class TaskExec ;
|
||||
|
||||
template<>
|
||||
class TaskQueueSpecialization< Kokkos::Cuda >
|
||||
{
|
||||
@ -69,6 +71,7 @@ public:
|
||||
using execution_space = Kokkos::Cuda ;
|
||||
using memory_space = Kokkos::CudaUVMSpace ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using member_type = TaskExec< Kokkos::Cuda > ;
|
||||
|
||||
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const ) {}
|
||||
@ -79,13 +82,15 @@ public:
|
||||
static
|
||||
void execute( queue_type * const );
|
||||
|
||||
template< typename FunctorType >
|
||||
template< typename TaskType >
|
||||
static
|
||||
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
|
||||
typename TaskType::function_type
|
||||
get_function_pointer()
|
||||
{
|
||||
using TaskType = TaskBase< execution_space
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType > ;
|
||||
using function_type = typename TaskType::function_type ;
|
||||
|
||||
function_type * const ptr =
|
||||
(function_type*) cuda_internal_scratch_unified( sizeof(function_type) );
|
||||
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
||||
@ -93,6 +98,8 @@ public:
|
||||
|
||||
CUDA_SAFE_CALL( cudaGetLastError() );
|
||||
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
|
||||
|
||||
return *ptr ;
|
||||
}
|
||||
};
|
||||
|
||||
@ -435,18 +442,26 @@ void parallel_reduce
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
// threadIdx.y == member number
|
||||
template< typename ValueType, typename iType, class Lambda >
|
||||
template< typename iType, class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
const Lambda & lambda) {
|
||||
const Closure & closure )
|
||||
{
|
||||
// Extract value_type from closure
|
||||
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, y, local_total;
|
||||
using value_type =
|
||||
typename Kokkos::Impl::FunctorAnalysis
|
||||
< Kokkos::Impl::FunctorPatternInterface::SCAN
|
||||
, void
|
||||
, Closure >::value_type ;
|
||||
|
||||
value_type accum = 0 ;
|
||||
value_type val, y, local_total;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
val = 0;
|
||||
lambda(i,val,false);
|
||||
closure(i,val,false);
|
||||
|
||||
// intra-blockDim.y exclusive scan on 'val'
|
||||
// accum = accumulated, sum in total for this iteration
|
||||
@ -458,7 +473,7 @@ void parallel_scan
|
||||
}
|
||||
|
||||
// pass accum to all threads
|
||||
local_total = shfl_warp_broadcast<ValueType>(val,
|
||||
local_total = shfl_warp_broadcast<value_type>(val,
|
||||
threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
|
||||
Impl::CudaTraits::WarpSize);
|
||||
|
||||
@ -467,7 +482,7 @@ void parallel_scan
|
||||
if ( threadIdx.y == 0 ) { val = 0 ; }
|
||||
|
||||
val += accum;
|
||||
lambda(i,val,true);
|
||||
closure(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
|
||||
@ -478,18 +493,26 @@ void parallel_scan
|
||||
// blockDim.y == team_size
|
||||
// threadIdx.x == position in vec
|
||||
// threadIdx.y == member number
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
template< typename iType, class Closure >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
const Closure & closure )
|
||||
{
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, y, local_total;
|
||||
// Extract value_type from closure
|
||||
|
||||
using value_type =
|
||||
typename Kokkos::Impl::FunctorAnalysis
|
||||
< Kokkos::Impl::FunctorPatternInterface::SCAN
|
||||
, void
|
||||
, Closure >::value_type ;
|
||||
|
||||
value_type accum = 0 ;
|
||||
value_type val, y, local_total;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
val = 0;
|
||||
lambda(i,val,false);
|
||||
closure(i,val,false);
|
||||
|
||||
// intra-blockDim.x exclusive scan on 'val'
|
||||
// accum = accumulated, sum in total for this iteration
|
||||
@ -501,14 +524,14 @@ void parallel_scan
|
||||
}
|
||||
|
||||
// pass accum to all threads
|
||||
local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
|
||||
local_total = shfl_warp_broadcast<value_type>(val, blockDim.x-1, blockDim.x);
|
||||
|
||||
// make EXCLUSIVE scan by shifting values over one
|
||||
val = Kokkos::shfl_up(val, 1, blockDim.x);
|
||||
if ( threadIdx.x == 0 ) { val = 0 ; }
|
||||
|
||||
val += accum;
|
||||
lambda(i,val,true);
|
||||
closure(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
|
||||
|
||||
@ -44,36 +44,47 @@
|
||||
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
|
||||
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
|
||||
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <initializer_list>
|
||||
|
||||
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_ENABLE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
|
||||
#define KOKKOS_IMPL_MDRANGE_IVDEP
|
||||
#include<impl/KokkosExp_Host_IterateTile.hpp>
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
#include<Cuda/KokkosExp_Cuda_IterateTile.hpp>
|
||||
#endif
|
||||
|
||||
namespace Kokkos { namespace Experimental {
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
enum class Iterate
|
||||
{
|
||||
Default, // Default for the device
|
||||
Left, // Left indices stride fastest
|
||||
Right, // Right indices stride fastest
|
||||
Flat, // Do not tile, only valid for inner direction
|
||||
};
|
||||
|
||||
template <typename ExecSpace>
|
||||
struct default_outer_direction
|
||||
{
|
||||
using type = Iterate;
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
static constexpr Iterate value = Iterate::Left;
|
||||
#else
|
||||
static constexpr Iterate value = Iterate::Right;
|
||||
#endif
|
||||
};
|
||||
|
||||
template <typename ExecSpace>
|
||||
struct default_inner_direction
|
||||
{
|
||||
using type = Iterate;
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
static constexpr Iterate value = Iterate::Left;
|
||||
#else
|
||||
static constexpr Iterate value = Iterate::Right;
|
||||
#endif
|
||||
};
|
||||
|
||||
|
||||
@ -86,7 +97,7 @@ struct Rank
|
||||
{
|
||||
static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
|
||||
static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
|
||||
static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
|
||||
static_assert( N < 7u, "Kokkos Error: Unsupported rank...");
|
||||
|
||||
using iteration_pattern = Rank<N, OuterDir, InnerDir>;
|
||||
|
||||
@ -96,498 +107,236 @@ struct Rank
|
||||
};
|
||||
|
||||
|
||||
|
||||
// multi-dimensional iteration pattern
|
||||
template <typename... Properties>
|
||||
struct MDRangePolicy
|
||||
: public Kokkos::Impl::PolicyTraits<Properties ...>
|
||||
{
|
||||
using traits = Kokkos::Impl::PolicyTraits<Properties ...>;
|
||||
using range_policy = RangePolicy<Properties...>;
|
||||
|
||||
static_assert( !std::is_same<range_policy,void>::value
|
||||
using impl_range_policy = RangePolicy< typename traits::execution_space
|
||||
, typename traits::schedule_type
|
||||
, typename traits::index_type
|
||||
> ;
|
||||
|
||||
static_assert( !std::is_same<typename traits::iteration_pattern,void>::value
|
||||
, "Kokkos Error: MD iteration pattern not defined" );
|
||||
|
||||
using iteration_pattern = typename range_policy::iteration_pattern;
|
||||
using work_tag = typename range_policy::work_tag;
|
||||
using iteration_pattern = typename traits::iteration_pattern;
|
||||
using work_tag = typename traits::work_tag;
|
||||
|
||||
static constexpr int rank = iteration_pattern::rank;
|
||||
|
||||
static constexpr int outer_direction = static_cast<int> (
|
||||
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
|
||||
(iteration_pattern::outer_direction != Iterate::Default)
|
||||
? iteration_pattern::outer_direction
|
||||
: default_outer_direction< typename range_policy::execution_space>::value );
|
||||
: default_outer_direction< typename traits::execution_space>::value );
|
||||
|
||||
static constexpr int inner_direction = static_cast<int> (
|
||||
iteration_pattern::inner_direction != Iterate::Default
|
||||
? iteration_pattern::inner_direction
|
||||
: default_inner_direction< typename range_policy::execution_space>::value ) ;
|
||||
: default_inner_direction< typename traits::execution_space>::value ) ;
|
||||
|
||||
|
||||
// Ugly ugly workaround intel 14 not handling scoped enum correctly
|
||||
static constexpr int Flat = static_cast<int>( Iterate::Flat );
|
||||
static constexpr int Right = static_cast<int>( Iterate::Right );
|
||||
static constexpr int Left = static_cast<int>( Iterate::Left );
|
||||
|
||||
using index_type = typename traits::index_type;
|
||||
using array_index_type = long;
|
||||
using point_type = Kokkos::Array<array_index_type,rank>; //was index_type
|
||||
using tile_type = Kokkos::Array<array_index_type,rank>;
|
||||
// If point_type or tile_type is not templated on a signed integral type (if it is unsigned),
|
||||
// then if user passes in intializer_list of runtime-determined values of
|
||||
// signed integral type that are not const will receive a compiler error due
|
||||
// to an invalid case for implicit conversion -
|
||||
// "conversion from integer or unscoped enumeration type to integer type that cannot represent all values of the original, except where source is a constant expression whose value can be stored exactly in the target type"
|
||||
// This would require the user to either pass a matching index_type parameter
|
||||
// as template parameter to the MDRangePolicy or static_cast the individual values
|
||||
|
||||
using size_type = typename range_policy::index_type;
|
||||
using index_type = typename std::make_signed<size_type>::type;
|
||||
|
||||
|
||||
template <typename I>
|
||||
MDRangePolicy( std::initializer_list<I> upper_corner )
|
||||
MDRangePolicy( point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{} )
|
||||
: m_lower(lower)
|
||||
, m_upper(upper)
|
||||
, m_tile(tile)
|
||||
, m_num_tiles(1)
|
||||
{
|
||||
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
|
||||
|
||||
// TODO check size of lists equal to rank
|
||||
// static_asserts on initializer_list.size() require c++14
|
||||
|
||||
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
|
||||
|
||||
const auto u = upper_corner.begin();
|
||||
|
||||
m_num_tiles = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
m_offset[i] = static_cast<index_type>(0);
|
||||
m_dim[i] = static_cast<index_type>(u[i]);
|
||||
if (inner_direction != Flat) {
|
||||
// default tile size to 4
|
||||
m_tile[i] = 4;
|
||||
} else {
|
||||
m_tile[i] = 1;
|
||||
}
|
||||
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
|
||||
m_num_tiles *= m_tile_dim[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename IA, typename IB>
|
||||
MDRangePolicy( std::initializer_list<IA> corner_a
|
||||
, std::initializer_list<IB> corner_b
|
||||
// Host
|
||||
if ( true
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
&& !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
|
||||
#endif
|
||||
)
|
||||
{
|
||||
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
|
||||
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
|
||||
|
||||
// TODO check size of lists equal to rank
|
||||
// static_asserts on initializer_list.size() require c++14
|
||||
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
|
||||
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
|
||||
index_type span;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
span = upper[i] - lower[i];
|
||||
if ( m_tile[i] <= 0 ) {
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
}
|
||||
else {
|
||||
m_tile[i] = span;
|
||||
}
|
||||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
}
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
else // Cuda
|
||||
{
|
||||
index_type span;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
span = upper[i] - lower[i];
|
||||
if ( m_tile[i] <= 0 ) {
|
||||
// TODO: determine what is a good default tile size for cuda
|
||||
// may be rank dependent
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
}
|
||||
else {
|
||||
m_tile[i] = 16;
|
||||
}
|
||||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
}
|
||||
index_type total_tile_size_check = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
total_tile_size_check *= m_tile[i];
|
||||
}
|
||||
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
|
||||
printf(" Tile dimensions exceed Cuda limits\n");
|
||||
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
using A = typename std::make_signed<IA>::type;
|
||||
using B = typename std::make_signed<IB>::type;
|
||||
template < typename LT , typename UT , typename TT = array_index_type >
|
||||
MDRangePolicy( std::initializer_list<LT> const& lower, std::initializer_list<UT> const& upper, std::initializer_list<TT> const& tile = {} )
|
||||
{
|
||||
#if 0
|
||||
// This should work, less duplicated code but not yet extensively tested
|
||||
point_type lower_tmp, upper_tmp;
|
||||
tile_type tile_tmp;
|
||||
for ( auto i = 0; i < rank; ++i ) {
|
||||
lower_tmp[i] = static_cast<array_index_type>(lower.begin()[i]);
|
||||
upper_tmp[i] = static_cast<array_index_type>(upper.begin()[i]);
|
||||
tile_tmp[i] = static_cast<array_index_type>(tile.begin()[i]);
|
||||
}
|
||||
|
||||
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
|
||||
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
|
||||
MDRangePolicy( lower_tmp, upper_tmp, tile_tmp );
|
||||
|
||||
#else
|
||||
if(m_lower.size()!=rank || m_upper.size() != rank)
|
||||
Kokkos::abort("MDRangePolicy: Constructor initializer lists have wrong size");
|
||||
|
||||
for ( auto i = 0; i < rank; ++i ) {
|
||||
m_lower[i] = static_cast<array_index_type>(lower.begin()[i]);
|
||||
m_upper[i] = static_cast<array_index_type>(upper.begin()[i]);
|
||||
if(tile.size()==rank)
|
||||
m_tile[i] = static_cast<array_index_type>(tile.begin()[i]);
|
||||
else
|
||||
m_tile[i] = 0;
|
||||
}
|
||||
|
||||
m_num_tiles = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
|
||||
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
|
||||
if (inner_direction != Flat) {
|
||||
// default tile size to 4
|
||||
m_tile[i] = 4;
|
||||
} else {
|
||||
m_tile[i] = 1;
|
||||
}
|
||||
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
|
||||
m_num_tiles *= m_tile_dim[i];
|
||||
}
|
||||
}
|
||||
|
||||
template <typename IA, typename IB, typename T>
|
||||
MDRangePolicy( std::initializer_list<IA> corner_a
|
||||
, std::initializer_list<IB> corner_b
|
||||
, std::initializer_list<T> tile
|
||||
|
||||
// Host
|
||||
if ( true
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
&& !std::is_same< typename traits::execution_space, Kokkos::Cuda >::value
|
||||
#endif
|
||||
)
|
||||
{
|
||||
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
|
||||
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
|
||||
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
|
||||
static_assert( inner_direction != Flat, "Kokkos Error: tiling not support with flat iteration" );
|
||||
|
||||
// TODO check size of lists equal to rank
|
||||
// static_asserts on initializer_list.size() require c++14
|
||||
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
|
||||
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
|
||||
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
|
||||
|
||||
using A = typename std::make_signed<IA>::type;
|
||||
using B = typename std::make_signed<IB>::type;
|
||||
|
||||
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
|
||||
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
|
||||
const auto t = tile.begin();
|
||||
|
||||
m_num_tiles = 1;
|
||||
index_type span;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
|
||||
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
|
||||
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
|
||||
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
|
||||
m_num_tiles *= m_tile_dim[i];
|
||||
span = m_upper[i] - m_lower[i];
|
||||
if ( m_tile[i] <= 0 ) {
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
}
|
||||
else {
|
||||
m_tile[i] = span;
|
||||
}
|
||||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
}
|
||||
}
|
||||
#if defined(KOKKOS_ENABLE_CUDA)
|
||||
else // Cuda
|
||||
{
|
||||
index_type span;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
span = m_upper[i] - m_lower[i];
|
||||
if ( m_tile[i] <= 0 ) {
|
||||
// TODO: determine what is a good default tile size for cuda
|
||||
// may be rank dependent
|
||||
if ( (inner_direction == Right && (i < rank-1))
|
||||
|| (inner_direction == Left && (i > 0)) )
|
||||
{
|
||||
m_tile[i] = 2;
|
||||
}
|
||||
else {
|
||||
m_tile[i] = 16;
|
||||
}
|
||||
}
|
||||
m_tile_end[i] = static_cast<index_type>((span + m_tile[i] - 1) / m_tile[i]);
|
||||
m_num_tiles *= m_tile_end[i];
|
||||
}
|
||||
index_type total_tile_size_check = 1;
|
||||
for (int i=0; i<rank; ++i) {
|
||||
total_tile_size_check *= m_tile[i];
|
||||
}
|
||||
if ( total_tile_size_check >= 1024 ) { // improve this check - 1024,1024,64 max per dim (Kepler), but product num_threads < 1024; more restrictions pending register limit
|
||||
printf(" Tile dimensions exceed Cuda limits\n");
|
||||
Kokkos::abort(" Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
//Kokkos::Impl::throw_runtime_exception( " Cuda ExecSpace Error: MDRange tile dims exceed maximum number of threads per block - choose smaller tile dims");
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
}
|
||||
|
||||
index_type m_offset[rank];
|
||||
index_type m_dim[rank];
|
||||
int m_tile[rank];
|
||||
index_type m_tile_dim[rank];
|
||||
size_type m_num_tiles; // product of tile dims
|
||||
|
||||
point_type m_lower;
|
||||
point_type m_upper;
|
||||
tile_type m_tile;
|
||||
point_type m_tile_end;
|
||||
index_type m_num_tiles;
|
||||
};
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
namespace Impl {
|
||||
|
||||
// Serial, Threads, OpenMP
|
||||
// use enable_if to overload for Cuda
|
||||
template < typename MDRange, typename Functor, typename Enable = void >
|
||||
struct MDForFunctor
|
||||
{
|
||||
using work_tag = typename MDRange::work_tag;
|
||||
using index_type = typename MDRange::index_type;
|
||||
using size_type = typename MDRange::size_type;
|
||||
|
||||
MDRange m_range;
|
||||
Functor m_func;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor( MDRange const& range, Functor const& f )
|
||||
: m_range(range)
|
||||
, m_func( f )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor( MDRange const& range, Functor && f )
|
||||
: m_range(range)
|
||||
, m_func( std::forward<Functor>(f) )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor( MDRange && range, Functor const& f )
|
||||
: m_range( std::forward<MDRange>(range) )
|
||||
, m_func( f )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor( MDRange && range, Functor && f )
|
||||
: m_range( std::forward<MDRange>(range) )
|
||||
, m_func( std::forward<Functor>(f) )
|
||||
{}
|
||||
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor( MDForFunctor const& ) = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor& operator=( MDForFunctor const& ) = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor( MDForFunctor && ) = default;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
MDForFunctor& operator=( MDForFunctor && ) = default;
|
||||
|
||||
// Rank-2, Flat, No Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 2
|
||||
&& MDRange::inner_direction == MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
|
||||
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
|
||||
} else {
|
||||
m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
|
||||
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
|
||||
}
|
||||
}
|
||||
|
||||
// Rank-2, Flat, Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& !std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 2
|
||||
&& MDRange::inner_direction == MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
|
||||
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
|
||||
} else {
|
||||
m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
|
||||
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
|
||||
}
|
||||
}
|
||||
|
||||
// Rank-2, Not Flat, No Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 2
|
||||
&& MDRange::inner_direction != MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
index_type t0, t1;
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
t0 = t / m_range.m_tile_dim[1];
|
||||
t1 = t % m_range.m_tile_dim[1];
|
||||
} else {
|
||||
t0 = t % m_range.m_tile_dim[0];
|
||||
t1 = t / m_range.m_tile_dim[0];
|
||||
}
|
||||
|
||||
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
|
||||
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
|
||||
|
||||
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
|
||||
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
|
||||
|
||||
if ( MDRange::inner_direction == MDRange::Right ) {
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
m_func( i0, i1 );
|
||||
}}
|
||||
} else {
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
m_func( i0, i1 );
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
// Rank-2, Not Flat, Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& !std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 2
|
||||
&& MDRange::inner_direction != MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
work_tag tag;
|
||||
|
||||
index_type t0, t1;
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
t0 = t / m_range.m_tile_dim[1];
|
||||
t1 = t % m_range.m_tile_dim[1];
|
||||
} else {
|
||||
t0 = t % m_range.m_tile_dim[0];
|
||||
t1 = t / m_range.m_tile_dim[0];
|
||||
}
|
||||
|
||||
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
|
||||
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
|
||||
|
||||
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
|
||||
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
|
||||
|
||||
if ( MDRange::inner_direction == MDRange::Right ) {
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
m_func( tag, i0, i1 );
|
||||
}}
|
||||
} else {
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
m_func( tag, i0, i1 );
|
||||
}}
|
||||
}
|
||||
}
|
||||
|
||||
//---------------------------------------------------------------------------
|
||||
|
||||
// Rank-3, Flat, No Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 3
|
||||
&& MDRange::inner_direction == MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
|
||||
m_func( m_range.m_offset[0] + ( t / tmp_prod )
|
||||
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
|
||||
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
|
||||
);
|
||||
} else {
|
||||
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
|
||||
m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
|
||||
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
|
||||
, m_range.m_offset[2] + ( t / tmp_prod )
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Rank-3, Flat, Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& !std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 3
|
||||
&& MDRange::inner_direction == MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
|
||||
m_func( work_tag{}
|
||||
, m_range.m_offset[0] + ( t / tmp_prod )
|
||||
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
|
||||
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
|
||||
);
|
||||
} else {
|
||||
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
|
||||
m_func( work_tag{}
|
||||
, m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
|
||||
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
|
||||
, m_range.m_offset[2] + ( t / tmp_prod )
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Rank-3, Not Flat, No Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 3
|
||||
&& MDRange::inner_direction != MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
index_type t0, t1, t2;
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
|
||||
t0 = t / tmp_prod;
|
||||
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
|
||||
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
|
||||
} else {
|
||||
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
|
||||
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
|
||||
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
|
||||
t2 = t / tmp_prod;
|
||||
}
|
||||
|
||||
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
|
||||
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
|
||||
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
|
||||
|
||||
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
|
||||
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
|
||||
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
|
||||
|
||||
if ( MDRange::inner_direction == MDRange::Right ) {
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i2=b2; i2<e2; ++i2) {
|
||||
m_func( i0, i1, i2 );
|
||||
}}}
|
||||
} else {
|
||||
for (int i2=b2; i2<e2; ++i2) {
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
m_func( i0, i1, i2 );
|
||||
}}}
|
||||
}
|
||||
}
|
||||
|
||||
// Rank-3, Not Flat, Tag
|
||||
template <typename Idx>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
typename std::enable_if<( std::is_integral<Idx>::value
|
||||
&& !std::is_same<void, work_tag>::value
|
||||
&& MDRange::rank == 3
|
||||
&& MDRange::inner_direction != MDRange::Flat
|
||||
)>::type
|
||||
operator()(Idx t) const
|
||||
{
|
||||
work_tag tag;
|
||||
|
||||
index_type t0, t1, t2;
|
||||
if ( MDRange::outer_direction == MDRange::Right ) {
|
||||
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
|
||||
t0 = t / tmp_prod;
|
||||
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
|
||||
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
|
||||
} else {
|
||||
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
|
||||
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
|
||||
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
|
||||
t2 = t / tmp_prod;
|
||||
}
|
||||
|
||||
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
|
||||
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
|
||||
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
|
||||
|
||||
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
|
||||
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
|
||||
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
|
||||
|
||||
if ( MDRange::inner_direction == MDRange::Right ) {
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i2=b2; i2<e2; ++i2) {
|
||||
m_func( tag, i0, i1, i2 );
|
||||
}}}
|
||||
} else {
|
||||
for (int i2=b2; i2<e2; ++i2) {
|
||||
for (int i1=b1; i1<e1; ++i1) {
|
||||
#if defined(KOKKOS_IMPL_MDRANGE_IVDEP)
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for (int i0=b0; i0<e0; ++i0) {
|
||||
m_func( tag, i0, i1, i2 );
|
||||
}}}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
|
||||
template <typename MDRange, typename Functor>
|
||||
// ------------------------------------------------------------------ //
|
||||
//md_parallel_for
|
||||
// ------------------------------------------------------------------ //
|
||||
template <typename MDRange, typename Functor, typename Enable = void>
|
||||
void md_parallel_for( MDRange const& range
|
||||
, Functor const& f
|
||||
, const std::string& str = ""
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDForFunctor<MDRange, Functor> g(range, f);
|
||||
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
|
||||
|
||||
using range_policy = typename MDRange::range_policy;
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
|
||||
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
|
||||
}
|
||||
@ -596,15 +345,132 @@ template <typename MDRange, typename Functor>
|
||||
void md_parallel_for( const std::string& str
|
||||
, MDRange const& range
|
||||
, Functor const& f
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDForFunctor<MDRange, Functor> g(range, f);
|
||||
Impl::MDFunctor<MDRange, Functor, void> g(range, f);
|
||||
|
||||
using range_policy = typename MDRange::range_policy;
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
|
||||
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
|
||||
}
|
||||
|
||||
// Cuda specialization
|
||||
#if defined( __CUDACC__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
template <typename MDRange, typename Functor>
|
||||
void md_parallel_for( const std::string& str
|
||||
, MDRange const& range
|
||||
, Functor const& f
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
|
||||
closure.execute();
|
||||
}
|
||||
|
||||
template <typename MDRange, typename Functor>
|
||||
void md_parallel_for( MDRange const& range
|
||||
, Functor const& f
|
||||
, const std::string& str = ""
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f);
|
||||
closure.execute();
|
||||
}
|
||||
#endif
|
||||
// ------------------------------------------------------------------ //
|
||||
|
||||
// ------------------------------------------------------------------ //
|
||||
//md_parallel_reduce
|
||||
// ------------------------------------------------------------------ //
|
||||
template <typename MDRange, typename Functor, typename ValueType>
|
||||
void md_parallel_reduce( MDRange const& range
|
||||
, Functor const& f
|
||||
, ValueType & v
|
||||
, const std::string& str = ""
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
|
||||
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
|
||||
}
|
||||
|
||||
template <typename MDRange, typename Functor, typename ValueType>
|
||||
void md_parallel_reduce( const std::string& str
|
||||
, MDRange const& range
|
||||
, Functor const& f
|
||||
, ValueType & v
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& !std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::MDFunctor<MDRange, Functor, ValueType> g(range, f, v);
|
||||
|
||||
//using range_policy = typename MDRange::range_policy;
|
||||
using range_policy = typename MDRange::impl_range_policy;
|
||||
|
||||
Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v );
|
||||
}
|
||||
|
||||
// Cuda - parallel_reduce not implemented yet
|
||||
/*
|
||||
template <typename MDRange, typename Functor, typename ValueType>
|
||||
void md_parallel_reduce( MDRange const& range
|
||||
, Functor const& f
|
||||
, ValueType & v
|
||||
, const std::string& str = ""
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
|
||||
closure.execute();
|
||||
}
|
||||
|
||||
template <typename MDRange, typename Functor, typename ValueType>
|
||||
void md_parallel_reduce( const std::string& str
|
||||
, MDRange const& range
|
||||
, Functor const& f
|
||||
, ValueType & v
|
||||
, typename std::enable_if<( true
|
||||
#if defined( KOKKOS_ENABLE_CUDA)
|
||||
&& std::is_same< typename MDRange::range_policy::execution_space, Kokkos::Cuda>::value
|
||||
#endif
|
||||
) >::type* = 0
|
||||
)
|
||||
{
|
||||
Impl::DeviceIterateTile<MDRange, Functor, typename MDRange::work_tag> closure(range, f, v);
|
||||
closure.execute();
|
||||
}
|
||||
*/
|
||||
|
||||
}} // namespace Kokkos::Experimental
|
||||
|
||||
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
|
||||
|
||||
@ -59,8 +59,14 @@ template< class T = void
|
||||
, class Proxy = void
|
||||
>
|
||||
struct Array {
|
||||
private:
|
||||
T m_elem[N];
|
||||
public:
|
||||
/**
|
||||
* The elements of this C array shall not be accessed directly. The data
|
||||
* member has to be declared public to enable aggregate initialization as for
|
||||
* std::array. We mark it as private in the documentation.
|
||||
* @private
|
||||
*/
|
||||
T m_internal_implementation_private_member_data[N];
|
||||
public:
|
||||
|
||||
typedef T & reference ;
|
||||
@ -78,25 +84,32 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
reference operator[]( const iType & i )
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
|
||||
return m_elem[i];
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
|
||||
return m_internal_implementation_private_member_data[i];
|
||||
}
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const_reference operator[]( const iType & i ) const
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
|
||||
return m_elem[i];
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
|
||||
return m_internal_implementation_private_member_data[i];
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION pointer data() { return & m_elem[0] ; }
|
||||
KOKKOS_INLINE_FUNCTION const_pointer data() const { return & m_elem[0] ; }
|
||||
KOKKOS_INLINE_FUNCTION pointer data()
|
||||
{
|
||||
return & m_internal_implementation_private_member_data[0];
|
||||
}
|
||||
KOKKOS_INLINE_FUNCTION const_pointer data() const
|
||||
{
|
||||
return & m_internal_implementation_private_member_data[0];
|
||||
}
|
||||
|
||||
~Array() = default ;
|
||||
Array() = default ;
|
||||
Array( const Array & ) = default ;
|
||||
Array & operator = ( const Array & ) = default ;
|
||||
// Do not default unless move and move-assignment are also defined
|
||||
// ~Array() = default ;
|
||||
// Array() = default ;
|
||||
// Array( const Array & ) = default ;
|
||||
// Array & operator = ( const Array & ) = default ;
|
||||
|
||||
// Some supported compilers are not sufficiently C++11 compliant
|
||||
// for default move constructor and move assignment operator.
|
||||
@ -124,7 +137,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
value_type operator[]( const iType & )
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integer argument" );
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
|
||||
return value_type();
|
||||
}
|
||||
|
||||
@ -132,7 +145,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
value_type operator[]( const iType & ) const
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integer argument" );
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integer argument" );
|
||||
return value_type();
|
||||
}
|
||||
|
||||
@ -181,7 +194,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
reference operator[]( const iType & i )
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
|
||||
return m_elem[i];
|
||||
}
|
||||
|
||||
@ -189,7 +202,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const_reference operator[]( const iType & i ) const
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
|
||||
return m_elem[i];
|
||||
}
|
||||
|
||||
@ -250,7 +263,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
reference operator[]( const iType & i )
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
|
||||
return m_elem[i*m_stride];
|
||||
}
|
||||
|
||||
@ -258,7 +271,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const_reference operator[]( const iType & i ) const
|
||||
{
|
||||
static_assert( std::is_integral<iType>::value , "Must be integral argument" );
|
||||
static_assert( ( std::is_integral<iType>::value || std::is_enum<iType>::value ) , "Must be integral argument" );
|
||||
return m_elem[i*m_stride];
|
||||
}
|
||||
|
||||
|
||||
@ -102,6 +102,7 @@ KOKKOS_IMPL_IS_CONCEPT( memory_traits )
|
||||
KOKKOS_IMPL_IS_CONCEPT( execution_space )
|
||||
KOKKOS_IMPL_IS_CONCEPT( execution_policy )
|
||||
KOKKOS_IMPL_IS_CONCEPT( array_layout )
|
||||
KOKKOS_IMPL_IS_CONCEPT( reducer )
|
||||
|
||||
namespace Impl {
|
||||
|
||||
|
||||
@ -57,6 +57,10 @@
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
#include <Kokkos_Qthreads.hpp>
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#include <Kokkos_Threads.hpp>
|
||||
#endif
|
||||
@ -76,6 +80,7 @@
|
||||
|
||||
#include <Kokkos_Complex.hpp>
|
||||
|
||||
#include <iosfwd>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -105,6 +110,9 @@ void finalize_all();
|
||||
|
||||
void fence();
|
||||
|
||||
/** \brief Print "Bill of Materials" */
|
||||
void print_configuration( std::ostream & , const bool detail = false );
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -159,4 +167,3 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
@ -63,7 +63,7 @@ namespace Kokkos {
|
||||
|
||||
struct AUTO_t {
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr const AUTO_t & operator()() const { return *this ; }
|
||||
constexpr const AUTO_t & operator()() const { return *this; }
|
||||
};
|
||||
|
||||
namespace {
|
||||
@ -73,46 +73,49 @@ constexpr AUTO_t AUTO = Kokkos::AUTO_t();
|
||||
|
||||
struct InvalidType {};
|
||||
|
||||
}
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
// Forward declarations for class inter-relationships
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
class HostSpace ; ///< Memory space for main process and CPU execution spaces
|
||||
class HostSpace; ///< Memory space for main process and CPU execution spaces
|
||||
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
namespace Experimental {
|
||||
class HBWSpace ; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
|
||||
class HBWSpace; /// Memory space for hbw_malloc from memkind (e.g. for KNL processor)
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_SERIAL )
|
||||
class Serial ; ///< Execution space main process on CPU
|
||||
#endif // defined( KOKKOS_ENABLE_SERIAL )
|
||||
class Serial; ///< Execution space main process on CPU.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
class Qthreads; ///< Execution space with Qthreads back-end.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD )
|
||||
class Threads ; ///< Execution space with pthreads back-end
|
||||
class Threads; ///< Execution space with pthreads back-end.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_OPENMP )
|
||||
class OpenMP ; ///< OpenMP execution space
|
||||
class OpenMP; ///< OpenMP execution space.
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
class CudaSpace ; ///< Memory space on Cuda GPU
|
||||
class CudaUVMSpace ; ///< Memory space on Cuda GPU with UVM
|
||||
class CudaHostPinnedSpace ; ///< Memory space on Host accessible to Cuda GPU
|
||||
class Cuda ; ///< Execution space for Cuda GPU
|
||||
class CudaSpace; ///< Memory space on Cuda GPU
|
||||
class CudaUVMSpace; ///< Memory space on Cuda GPU with UVM
|
||||
class CudaHostPinnedSpace; ///< Memory space on Host accessible to Cuda GPU
|
||||
class Cuda; ///< Execution space for Cuda GPU
|
||||
#endif
|
||||
|
||||
template<class ExecutionSpace, class MemorySpace>
|
||||
struct Device;
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
// Set the default execution space.
|
||||
|
||||
@ -122,60 +125,66 @@ struct Device;
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
typedef Cuda DefaultExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef OpenMP DefaultExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
typedef Threads DefaultExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
typedef Serial DefaultExecutionSpace ;
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
typedef Cuda DefaultExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef OpenMP DefaultExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
typedef Threads DefaultExecutionSpace;
|
||||
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
|
||||
// typedef Qthreads DefaultExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
typedef Serial DefaultExecutionSpace;
|
||||
#else
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
|
||||
#endif
|
||||
|
||||
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef OpenMP DefaultHostExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
typedef Threads DefaultHostExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
typedef Serial DefaultHostExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_OPENMP )
|
||||
typedef OpenMP DefaultHostExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_PTHREAD )
|
||||
typedef Threads DefaultHostExecutionSpace ;
|
||||
#elif defined ( KOKKOS_ENABLE_SERIAL )
|
||||
typedef Serial DefaultHostExecutionSpace ;
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef OpenMP DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
typedef Threads DefaultHostExecutionSpace;
|
||||
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
|
||||
// typedef Qthreads DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
typedef Serial DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef OpenMP DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
typedef Threads DefaultHostExecutionSpace;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Qthreads DefaultHostExecutionSpace;
|
||||
#elif defined( KOKKOS_ENABLE_SERIAL )
|
||||
typedef Serial DefaultHostExecutionSpace;
|
||||
#else
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads."
|
||||
# error "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial."
|
||||
#endif
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
// Detect the active execution space and define its memory space.
|
||||
// This is used to verify whether a running kernel can access
|
||||
// a given memory space.
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined (KOKKOS_ENABLE_CUDA)
|
||||
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace ;
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
typedef Kokkos::CudaSpace ActiveExecutionMemorySpace;
|
||||
#elif defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
typedef Kokkos::HostSpace ActiveExecutionMemorySpace ;
|
||||
typedef Kokkos::HostSpace ActiveExecutionMemorySpace;
|
||||
#else
|
||||
typedef void ActiveExecutionMemorySpace ;
|
||||
typedef void ActiveExecutionMemorySpace;
|
||||
#endif
|
||||
|
||||
template< class ActiveSpace , class MemorySpace >
|
||||
template< class ActiveSpace, class MemorySpace >
|
||||
struct VerifyExecutionCanAccessMemorySpace {
|
||||
enum {value = 0};
|
||||
};
|
||||
|
||||
template< class Space >
|
||||
struct VerifyExecutionCanAccessMemorySpace< Space , Space >
|
||||
struct VerifyExecutionCanAccessMemorySpace< Space, Space >
|
||||
{
|
||||
enum {value = 1};
|
||||
KOKKOS_INLINE_FUNCTION static void verify(void) {}
|
||||
@ -183,27 +192,27 @@ struct VerifyExecutionCanAccessMemorySpace< Space , Space >
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE , DATA_PTR ) \
|
||||
#define KOKKOS_RESTRICT_EXECUTION_TO_DATA( DATA_SPACE, DATA_PTR ) \
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
|
||||
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify( DATA_PTR )
|
||||
Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify( DATA_PTR )
|
||||
|
||||
#define KOKKOS_RESTRICT_EXECUTION_TO_( DATA_SPACE ) \
|
||||
Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \
|
||||
Kokkos::Impl::ActiveExecutionMemorySpace , DATA_SPACE >::verify()
|
||||
Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE >::verify()
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
void fence();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class Functor
|
||||
@ -220,18 +229,18 @@ struct FunctorPolicyExecutionSpace;
|
||||
///
|
||||
/// This is an implementation detail of parallel_for. Users should
|
||||
/// skip this and go directly to the nonmember function parallel_for.
|
||||
template< class FunctorType , class ExecPolicy , class ExecutionSpace =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
> class ParallelFor ;
|
||||
template< class FunctorType, class ExecPolicy, class ExecutionSpace =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
|
||||
> class ParallelFor;
|
||||
|
||||
/// \class ParallelReduce
|
||||
/// \brief Implementation detail of parallel_reduce.
|
||||
///
|
||||
/// This is an implementation detail of parallel_reduce. Users should
|
||||
/// skip this and go directly to the nonmember function parallel_reduce.
|
||||
template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
> class ParallelReduce ;
|
||||
template< class FunctorType, class ExecPolicy, class ReducerType = InvalidType, class ExecutionSpace =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
|
||||
> class ParallelReduce;
|
||||
|
||||
/// \class ParallelScan
|
||||
/// \brief Implementation detail of parallel_scan.
|
||||
@ -239,10 +248,12 @@ template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType
|
||||
/// This is an implementation detail of parallel_scan. Users should
|
||||
/// skip this and go directly to the documentation of the nonmember
|
||||
/// template function Kokkos::parallel_scan.
|
||||
template< class FunctorType , class ExecPolicy , class ExecutionSapce =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
|
||||
> class ParallelScan ;
|
||||
template< class FunctorType, class ExecPolicy, class ExecutionSapce =
|
||||
typename Impl::FunctorPolicyExecutionSpace< FunctorType, ExecPolicy >::execution_space
|
||||
> class ParallelScan;
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
}}
|
||||
#endif /* #ifndef KOKKOS_CORE_FWD_HPP */
|
||||
|
||||
|
||||
@ -62,7 +62,6 @@
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
@ -295,6 +294,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
#include <Cuda/Kokkos_Cuda_Parallel.hpp>
|
||||
#include <Cuda/Kokkos_Cuda_Task.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) */
|
||||
|
||||
@ -44,14 +44,16 @@
|
||||
#ifndef KOKKOS_HBWSPACE_HPP
|
||||
#define KOKKOS_HBWSPACE_HPP
|
||||
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
/// \brief Initialize lock array for arbitrary size atomics.
|
||||
@ -67,7 +69,7 @@ void init_lock_array_hbw_space();
|
||||
/// This function tries to aquire the lock for the hash value derived
|
||||
/// from the provided ptr. If the lock is successfully aquired the
|
||||
/// function returns true. Otherwise it returns false.
|
||||
bool lock_address_hbw_space(void* ptr);
|
||||
bool lock_address_hbw_space( void* ptr );
|
||||
|
||||
/// \brief Release lock for the address
|
||||
///
|
||||
@ -75,13 +77,16 @@ bool lock_address_hbw_space(void* ptr);
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully aquiring a lock with
|
||||
/// lock_address.
|
||||
void unlock_address_hbw_space(void* ptr);
|
||||
void unlock_address_hbw_space( void* ptr );
|
||||
|
||||
} // namespace Impl
|
||||
} // neamspace Experimental
|
||||
|
||||
} // namespace Experimental
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Experimental {
|
||||
|
||||
/// \class HBWSpace
|
||||
@ -91,10 +96,9 @@ namespace Experimental {
|
||||
/// memory means the usual CPU-accessible memory.
|
||||
class HBWSpace {
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos memory space
|
||||
typedef HBWSpace memory_space ;
|
||||
typedef size_t size_type ;
|
||||
typedef HBWSpace memory_space;
|
||||
typedef size_t size_type;
|
||||
|
||||
/// \typedef execution_space
|
||||
/// \brief Default execution space for this memory space.
|
||||
@ -103,21 +107,25 @@ public:
|
||||
/// useful for things like initializing a View (which happens in
|
||||
/// parallel using the View's default execution space).
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space ;
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
typedef Kokkos::Threads execution_space ;
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space ;
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
typedef Kokkos::Threads execution_space ;
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_SERIAL )
|
||||
typedef Kokkos::Serial execution_space ;
|
||||
typedef Kokkos::Serial execution_space;
|
||||
#else
|
||||
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
|
||||
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qhreads, or Kokkos::Serial. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
|
||||
#endif
|
||||
|
||||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
typedef Kokkos::Device< execution_space, memory_space > device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HBWSpace */
|
||||
@ -129,67 +137,68 @@ public:
|
||||
|
||||
/**\brief Default memory space instance */
|
||||
HBWSpace();
|
||||
HBWSpace( const HBWSpace & rhs ) = default ;
|
||||
HBWSpace & operator = ( const HBWSpace & ) = default ;
|
||||
~HBWSpace() = default ;
|
||||
HBWSpace( const HBWSpace & rhs ) = default;
|
||||
HBWSpace & operator = ( const HBWSpace & ) = default;
|
||||
~HBWSpace() = default;
|
||||
|
||||
/**\brief Non-default memory space instance to choose allocation mechansim, if available */
|
||||
|
||||
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
|
||||
enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
|
||||
|
||||
explicit
|
||||
HBWSpace( const AllocationMechanism & );
|
||||
|
||||
/**\brief Allocate untracked memory in the space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
void * allocate( const size_t arg_alloc_size ) const;
|
||||
|
||||
/**\brief Deallocate untracked memory in the space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
, const size_t arg_alloc_size ) const;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
|
||||
private:
|
||||
|
||||
AllocationMechanism m_alloc_mech ;
|
||||
AllocationMechanism m_alloc_mech;
|
||||
static constexpr const char* m_name = "HBW";
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void > ;
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >;
|
||||
};
|
||||
|
||||
} // namespace Experimental
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >
|
||||
: public SharedAllocationRecord< void , void >
|
||||
class SharedAllocationRecord< Kokkos::Experimental::HBWSpace, void >
|
||||
: public SharedAllocationRecord< void, void >
|
||||
{
|
||||
private:
|
||||
|
||||
friend Kokkos::Experimental::HBWSpace ;
|
||||
friend Kokkos::Experimental::HBWSpace;
|
||||
|
||||
typedef SharedAllocationRecord< void , void > RecordBase ;
|
||||
typedef SharedAllocationRecord< void, void > RecordBase;
|
||||
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
|
||||
|
||||
static void deallocate( RecordBase * );
|
||||
|
||||
/**\brief Root record for tracked allocations from this HBWSpace instance */
|
||||
static RecordBase s_root_record ;
|
||||
static RecordBase s_root_record;
|
||||
|
||||
const Kokkos::Experimental::HBWSpace m_space ;
|
||||
const Kokkos::Experimental::HBWSpace m_space;
|
||||
|
||||
protected:
|
||||
|
||||
~SharedAllocationRecord();
|
||||
SharedAllocationRecord() = default ;
|
||||
SharedAllocationRecord() = default;
|
||||
|
||||
SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
@ -212,9 +221,9 @@ public:
|
||||
)
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
|
||||
return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
|
||||
#else
|
||||
return (SharedAllocationRecord *) 0 ;
|
||||
return (SharedAllocationRecord *) 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -233,88 +242,93 @@ public:
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
static void print_records( std::ostream & , const Kokkos::Experimental::HBWSpace & , bool detail = false );
|
||||
static void print_records( std::ostream &, const Kokkos::Experimental::HBWSpace &, bool detail = false );
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::Experimental::HBWSpace >::assignable , "" );
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::Experimental::HBWSpace >::assignable, "" );
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace > {
|
||||
struct MemorySpaceAccess< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace > {
|
||||
enum { assignable = true };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy = true };
|
||||
};
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace> {
|
||||
struct MemorySpaceAccess< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace > {
|
||||
enum { assignable = false };
|
||||
enum { accessible = true };
|
||||
enum { deepcopy = true };
|
||||
};
|
||||
|
||||
}}
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<Experimental::HBWSpace,Experimental::HBWSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
memcpy( dst , src , n );
|
||||
template< class ExecutionSpace >
|
||||
struct DeepCopy< Experimental::HBWSpace, Experimental::HBWSpace, ExecutionSpace > {
|
||||
DeepCopy( void * dst, const void * src, size_t n ) {
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
|
||||
exec.fence();
|
||||
memcpy( dst , src , n );
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
};
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<HostSpace,Experimental::HBWSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
memcpy( dst , src , n );
|
||||
template< class ExecutionSpace >
|
||||
struct DeepCopy< HostSpace, Experimental::HBWSpace, ExecutionSpace > {
|
||||
DeepCopy( void * dst, const void * src, size_t n ) {
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
|
||||
exec.fence();
|
||||
memcpy( dst , src , n );
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
};
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<Experimental::HBWSpace,HostSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
memcpy( dst , src , n );
|
||||
template< class ExecutionSpace >
|
||||
struct DeepCopy< Experimental::HBWSpace, HostSpace, ExecutionSpace > {
|
||||
DeepCopy( void * dst, const void * src, size_t n ) {
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
|
||||
exec.fence();
|
||||
memcpy( dst , src , n );
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experimental::HBWSpace >
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace, Kokkos::Experimental::HBWSpace >
|
||||
{
|
||||
enum { value = true };
|
||||
inline static void verify( void ) { }
|
||||
@ -322,7 +336,7 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::HostSpace , Kokkos::Experime
|
||||
};
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kokkos::HostSpace >
|
||||
struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace, Kokkos::HostSpace >
|
||||
{
|
||||
enum { value = true };
|
||||
inline static void verify( void ) { }
|
||||
@ -330,8 +344,9 @@ struct VerifyExecutionCanAccessMemorySpace< Kokkos::Experimental::HBWSpace , Kok
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif
|
||||
#endif /* #define KOKKOS_HBWSPACE_HPP */
|
||||
|
||||
#endif // #define KOKKOS_HBWSPACE_HPP
|
||||
|
||||
@ -60,6 +60,7 @@
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
/// \brief Initialize lock array for arbitrary size atomics.
|
||||
@ -83,9 +84,10 @@ bool lock_address_host_space(void* ptr);
|
||||
/// from the provided ptr. This function should only be called
|
||||
/// after previously successfully aquiring a lock with
|
||||
/// lock_address.
|
||||
void unlock_address_host_space(void* ptr);
|
||||
void unlock_address_host_space( void* ptr );
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
@ -97,10 +99,9 @@ namespace Kokkos {
|
||||
/// memory means the usual CPU-accessible memory.
|
||||
class HostSpace {
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos memory space
|
||||
typedef HostSpace memory_space ;
|
||||
typedef size_t size_type ;
|
||||
typedef HostSpace memory_space;
|
||||
typedef size_t size_type;
|
||||
|
||||
/// \typedef execution_space
|
||||
/// \brief Default execution space for this memory space.
|
||||
@ -109,21 +110,25 @@ public:
|
||||
/// useful for things like initializing a View (which happens in
|
||||
/// parallel using the View's default execution space).
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space ;
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
typedef Kokkos::Threads execution_space ;
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
typedef Kokkos::OpenMP execution_space ;
|
||||
typedef Kokkos::OpenMP execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
typedef Kokkos::Threads execution_space ;
|
||||
typedef Kokkos::Threads execution_space;
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// typedef Kokkos::Qthreads execution_space;
|
||||
#elif defined( KOKKOS_ENABLE_SERIAL )
|
||||
typedef Kokkos::Serial execution_space ;
|
||||
typedef Kokkos::Serial execution_space;
|
||||
#else
|
||||
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Serial, or Kokkos::Threads. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
|
||||
# error "At least one of the following host execution spaces must be defined: Kokkos::OpenMP, Kokkos::Threads, Kokkos::Qthreads, or Kokkos::Serial. You might be seeing this message if you disabled the Kokkos::Serial device explicitly using the Kokkos_ENABLE_Serial:BOOL=OFF CMake option, but did not enable any of the other host execution space devices."
|
||||
#endif
|
||||
|
||||
//! This memory space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
typedef Kokkos::Device< execution_space, memory_space > device_type;
|
||||
|
||||
/*--------------------------------*/
|
||||
/* Functions unique to the HostSpace */
|
||||
@ -135,61 +140,57 @@ public:
|
||||
|
||||
/**\brief Default memory space instance */
|
||||
HostSpace();
|
||||
HostSpace( HostSpace && rhs ) = default ;
|
||||
HostSpace( const HostSpace & rhs ) = default ;
|
||||
HostSpace & operator = ( HostSpace && ) = default ;
|
||||
HostSpace & operator = ( const HostSpace & ) = default ;
|
||||
~HostSpace() = default ;
|
||||
HostSpace( HostSpace && rhs ) = default;
|
||||
HostSpace( const HostSpace & rhs ) = default;
|
||||
HostSpace & operator = ( HostSpace && ) = default;
|
||||
HostSpace & operator = ( const HostSpace & ) = default;
|
||||
~HostSpace() = default;
|
||||
|
||||
/**\brief Non-default memory space instance to choose allocation mechansim, if available */
|
||||
|
||||
enum AllocationMechanism { STD_MALLOC , POSIX_MEMALIGN , POSIX_MMAP , INTEL_MM_ALLOC };
|
||||
enum AllocationMechanism { STD_MALLOC, POSIX_MEMALIGN, POSIX_MMAP, INTEL_MM_ALLOC };
|
||||
|
||||
explicit
|
||||
HostSpace( const AllocationMechanism & );
|
||||
|
||||
/**\brief Allocate untracked memory in the space */
|
||||
void * allocate( const size_t arg_alloc_size ) const ;
|
||||
void * allocate( const size_t arg_alloc_size ) const;
|
||||
|
||||
/**\brief Deallocate untracked memory in the space */
|
||||
void deallocate( void * const arg_alloc_ptr
|
||||
, const size_t arg_alloc_size ) const ;
|
||||
, const size_t arg_alloc_size ) const;
|
||||
|
||||
/**\brief Return Name of the MemorySpace */
|
||||
static constexpr const char* name();
|
||||
|
||||
private:
|
||||
|
||||
AllocationMechanism m_alloc_mech ;
|
||||
AllocationMechanism m_alloc_mech;
|
||||
static constexpr const char* m_name = "Host";
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > ;
|
||||
friend class Kokkos::Impl::SharedAllocationRecord< Kokkos::HostSpace, void >;
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace , Kokkos::HostSpace >::assignable , "" );
|
||||
|
||||
static_assert( Kokkos::Impl::MemorySpaceAccess< Kokkos::HostSpace, Kokkos::HostSpace >::assignable, "" );
|
||||
|
||||
template< typename S >
|
||||
struct HostMirror {
|
||||
private:
|
||||
|
||||
// If input execution space can access HostSpace then keep it.
|
||||
// Example: Kokkos::OpenMP can access, Kokkos::Cuda cannot
|
||||
enum { keep_exe = Kokkos::Impl::MemorySpaceAccess
|
||||
< typename S::execution_space::memory_space , Kokkos::HostSpace >
|
||||
::accessible };
|
||||
< typename S::execution_space::memory_space, Kokkos::HostSpace >::accessible };
|
||||
|
||||
// If HostSpace can access memory space then keep it.
|
||||
// Example: Cannot access Kokkos::CudaSpace, can access Kokkos::CudaUVMSpace
|
||||
enum { keep_mem = Kokkos::Impl::MemorySpaceAccess
|
||||
< Kokkos::HostSpace , typename S::memory_space >::accessible };
|
||||
< Kokkos::HostSpace, typename S::memory_space >::accessible };
|
||||
|
||||
public:
|
||||
|
||||
@ -202,42 +203,41 @@ public:
|
||||
, typename S::memory_space >
|
||||
, Kokkos::HostSpace
|
||||
>::type
|
||||
>::type Space ;
|
||||
>::type Space;
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class SharedAllocationRecord< Kokkos::HostSpace , void >
|
||||
: public SharedAllocationRecord< void , void >
|
||||
class SharedAllocationRecord< Kokkos::HostSpace, void >
|
||||
: public SharedAllocationRecord< void, void >
|
||||
{
|
||||
private:
|
||||
friend Kokkos::HostSpace;
|
||||
|
||||
friend Kokkos::HostSpace ;
|
||||
typedef SharedAllocationRecord< void, void > RecordBase;
|
||||
|
||||
typedef SharedAllocationRecord< void , void > RecordBase ;
|
||||
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete ;
|
||||
SharedAllocationRecord( const SharedAllocationRecord & ) = delete;
|
||||
SharedAllocationRecord & operator = ( const SharedAllocationRecord & ) = delete;
|
||||
|
||||
static void deallocate( RecordBase * );
|
||||
|
||||
/**\brief Root record for tracked allocations from this HostSpace instance */
|
||||
static RecordBase s_root_record ;
|
||||
static RecordBase s_root_record;
|
||||
|
||||
const Kokkos::HostSpace m_space ;
|
||||
const Kokkos::HostSpace m_space;
|
||||
|
||||
protected:
|
||||
|
||||
~SharedAllocationRecord();
|
||||
SharedAllocationRecord() = default ;
|
||||
SharedAllocationRecord() = default;
|
||||
|
||||
SharedAllocationRecord( const Kokkos::HostSpace & arg_space
|
||||
, const std::string & arg_label
|
||||
@ -260,12 +260,13 @@ public:
|
||||
)
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
return new SharedAllocationRecord( arg_space , arg_label , arg_alloc_size );
|
||||
return new SharedAllocationRecord( arg_space, arg_label, arg_alloc_size );
|
||||
#else
|
||||
return (SharedAllocationRecord *) 0 ;
|
||||
return (SharedAllocationRecord *) 0;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
/**\brief Allocate tracked memory in the space */
|
||||
static
|
||||
void * allocate_tracked( const Kokkos::HostSpace & arg_space
|
||||
@ -281,37 +282,37 @@ public:
|
||||
static
|
||||
void deallocate_tracked( void * const arg_alloc_ptr );
|
||||
|
||||
|
||||
static SharedAllocationRecord * get_record( void * arg_alloc_ptr );
|
||||
|
||||
static void print_records( std::ostream & , const Kokkos::HostSpace & , bool detail = false );
|
||||
static void print_records( std::ostream &, const Kokkos::HostSpace &, bool detail = false );
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space> struct DeepCopy ;
|
||||
template< class DstSpace, class SrcSpace, class ExecutionSpace = typename DstSpace::execution_space > struct DeepCopy;
|
||||
|
||||
template<class ExecutionSpace>
|
||||
struct DeepCopy<HostSpace,HostSpace,ExecutionSpace> {
|
||||
DeepCopy( void * dst , const void * src , size_t n ) {
|
||||
memcpy( dst , src , n );
|
||||
template< class ExecutionSpace >
|
||||
struct DeepCopy< HostSpace, HostSpace, ExecutionSpace > {
|
||||
DeepCopy( void * dst, const void * src, size_t n ) {
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst , const void * src , size_t n ) {
|
||||
|
||||
DeepCopy( const ExecutionSpace& exec, void * dst, const void * src, size_t n ) {
|
||||
exec.fence();
|
||||
memcpy( dst , src , n );
|
||||
memcpy( dst, src, n );
|
||||
}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
#endif /* #define KOKKOS_HOSTSPACE_HPP */
|
||||
|
||||
#endif // #define KOKKOS_HOSTSPACE_HPP
|
||||
|
||||
@ -45,22 +45,20 @@
|
||||
#define KOKKOS_MACROS_HPP
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** Pick up configure/build options via #define macros:
|
||||
/** Pick up configure / build options via #define macros:
|
||||
*
|
||||
* KOKKOS_ENABLE_CUDA Kokkos::Cuda execution and memory spaces
|
||||
* KOKKOS_ENABLE_PTHREAD Kokkos::Threads execution space
|
||||
* KOKKOS_ENABLE_QTHREAD Kokkos::Qthread execution space
|
||||
* KOKKOS_ENABLE_QTHREADS Kokkos::Qthreads execution space
|
||||
* KOKKOS_ENABLE_OPENMP Kokkos::OpenMP execution space
|
||||
* KOKKOS_ENABLE_HWLOC HWLOC library is available
|
||||
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK insert array bounds checks, is expensive!
|
||||
*
|
||||
* KOKKOS_ENABLE_MPI negotiate MPI/execution space interactions
|
||||
*
|
||||
* KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space
|
||||
* KOKKOS_ENABLE_HWLOC HWLOC library is available.
|
||||
* KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive!
|
||||
* KOKKOS_ENABLE_MPI Negotiate MPI/execution space interactions.
|
||||
* KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space.
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_DONT_INCLUDE_CORE_CONFIG_H
|
||||
#include <KokkosCore_config.h>
|
||||
#include <KokkosCore_config.h>
|
||||
#endif
|
||||
|
||||
#include <impl/Kokkos_OldMacros.hpp>
|
||||
@ -86,7 +84,7 @@
|
||||
* KOKKOS_ENABLE_INTEL_ATOMICS
|
||||
* KOKKOS_ENABLE_OPENMP_ATOMICS
|
||||
*
|
||||
* A suite of 'KOKKOS_HAVE_PRAGMA_...' are defined for internal use.
|
||||
* A suite of 'KOKKOS_ENABLE_PRAGMA_...' are defined for internal use.
|
||||
*
|
||||
* Macros for marking functions to run in an execution space:
|
||||
*
|
||||
@ -98,64 +96,63 @@
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
|
||||
// Compiling with a CUDA compiler.
|
||||
//
|
||||
// Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
|
||||
// CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
|
||||
//
|
||||
// When generating device code the __CUDA_ARCH__ macro is defined as:
|
||||
// __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
|
||||
|
||||
/* Compiling with a CUDA compiler.
|
||||
*
|
||||
* Include <cuda.h> to pick up the CUDA_VERSION macro defined as:
|
||||
* CUDA_VERSION = ( MAJOR_VERSION * 1000 ) + ( MINOR_VERSION * 10 )
|
||||
*
|
||||
* When generating device code the __CUDA_ARCH__ macro is defined as:
|
||||
* __CUDA_ARCH__ = ( MAJOR_CAPABILITY * 100 ) + ( MINOR_CAPABILITY * 10 )
|
||||
*/
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda.h>
|
||||
|
||||
#include <cuda_runtime.h>
|
||||
#include <cuda.h>
|
||||
#if !defined( CUDA_VERSION )
|
||||
#error "#include <cuda.h> did not define CUDA_VERSION."
|
||||
#endif
|
||||
|
||||
#if ! defined( CUDA_VERSION )
|
||||
#error "#include <cuda.h> did not define CUDA_VERSION"
|
||||
#endif
|
||||
#if ( CUDA_VERSION < 7000 )
|
||||
// CUDA supports C++11 in device code starting with version 7.0.
|
||||
// This includes auto type and device code internal lambdas.
|
||||
#error "Cuda version 7.0 or greater required."
|
||||
#endif
|
||||
|
||||
#if ( CUDA_VERSION < 7000 )
|
||||
// CUDA supports C++11 in device code starting with
|
||||
// version 7.0. This includes auto type and device code internal
|
||||
// lambdas.
|
||||
#error "Cuda version 7.0 or greater required"
|
||||
#endif
|
||||
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
|
||||
// Compiling with CUDA compiler for device code.
|
||||
#error "Cuda device capability >= 3.0 is required."
|
||||
#endif
|
||||
|
||||
#if defined( __CUDA_ARCH__ ) && ( __CUDA_ARCH__ < 300 )
|
||||
/* Compiling with CUDA compiler for device code. */
|
||||
#error "Cuda device capability >= 3.0 is required"
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
|
||||
#if ( CUDA_VERSION < 7050 )
|
||||
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
|
||||
#if ( CUDA_VERSION < 7050 )
|
||||
// CUDA supports C++11 lambdas generated in host code to be given
|
||||
// to the device starting with version 7.5. But the release candidate (7.5.6)
|
||||
// still identifies as 7.0
|
||||
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
|
||||
#endif
|
||||
#if ( CUDA_VERSION < 8000 ) && defined(__NVCC__)
|
||||
// still identifies as 7.0.
|
||||
#error "Cuda version 7.5 or greater required for host-to-device Lambda support."
|
||||
#endif
|
||||
|
||||
#if ( CUDA_VERSION < 8000 ) && defined( __NVCC__ )
|
||||
#define KOKKOS_LAMBDA [=]__device__
|
||||
#else
|
||||
#else
|
||||
#define KOKKOS_LAMBDA [=]__host__ __device__
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CXX1Z )
|
||||
#define KOKKOS_CLASS_LAMBDA [=,*this] __host__ __device__
|
||||
#endif
|
||||
#endif
|
||||
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
|
||||
#endif
|
||||
#endif /* #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ ) */
|
||||
#endif
|
||||
|
||||
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
|
||||
#endif
|
||||
#endif // #if defined( KOKKOS_ENABLE_CUDA ) && defined( __CUDACC__ )
|
||||
|
||||
#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA)
|
||||
#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
|
||||
// Cuda version 8.0 still needs the functor wrapper
|
||||
#if (KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ ) && defined(__NVCC__)
|
||||
#if /* ( CUDA_VERSION < 8000 ) && */ defined( __NVCC__ )
|
||||
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Language info: C++, CUDA, OPENMP */
|
||||
//----------------------------------------------------------------------------
|
||||
// Language info: C++, CUDA, OPENMP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CUDA )
|
||||
// Compiling Cuda code to 'ptx'
|
||||
@ -163,20 +160,17 @@
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION __device__ __host__ __forceinline__
|
||||
#define KOKKOS_INLINE_FUNCTION __device__ __host__ inline
|
||||
#define KOKKOS_FUNCTION __device__ __host__
|
||||
#endif /* #if defined( __CUDA_ARCH__ ) */
|
||||
#endif // #if defined( __CUDA_ARCH__ )
|
||||
|
||||
#if defined( _OPENMP )
|
||||
// Compiling with OpenMP.
|
||||
// The value of _OPENMP is an integer value YYYYMM
|
||||
// where YYYY and MM are the year and month designation
|
||||
// of the supported OpenMP API version.
|
||||
#endif // #if defined( _OPENMP )
|
||||
|
||||
/* Compiling with OpenMP.
|
||||
* The value of _OPENMP is an integer value YYYYMM
|
||||
* where YYYY and MM are the year and month designation
|
||||
* of the supported OpenMP API version.
|
||||
*/
|
||||
|
||||
#endif /* #if defined( _OPENMP ) */
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Mapping compiler built-ins to KOKKOS_COMPILER_*** macros */
|
||||
//----------------------------------------------------------------------------
|
||||
// Mapping compiler built-ins to KOKKOS_COMPILER_*** macros
|
||||
|
||||
#if defined( __NVCC__ )
|
||||
// NVIDIA compiler is being used.
|
||||
@ -184,29 +178,28 @@
|
||||
// Host code is compiled again with another compiler.
|
||||
// Device code is compile to 'ptx'.
|
||||
#define KOKKOS_COMPILER_NVCC __NVCC__
|
||||
|
||||
#else
|
||||
#if ! defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
|
||||
#if !defined (KOKKOS_ENABLE_CUDA) // Compiling with clang for Cuda does not work with LAMBDAs either
|
||||
#if !defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA )
|
||||
#if !defined( KOKKOS_ENABLE_CUDA ) // Compiling with clang for Cuda does not work with LAMBDAs either
|
||||
// CUDA (including version 6.5) does not support giving lambdas as
|
||||
// arguments to global functions. Thus its not currently possible
|
||||
// to dispatch lambdas from the host.
|
||||
#define KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA 1
|
||||
#endif
|
||||
#endif
|
||||
#endif /* #if defined( __NVCC__ ) */
|
||||
#endif // #if defined( __NVCC__ )
|
||||
|
||||
#if !defined (KOKKOS_LAMBDA)
|
||||
#if !defined( KOKKOS_LAMBDA )
|
||||
#define KOKKOS_LAMBDA [=]
|
||||
#endif
|
||||
|
||||
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined (KOKKOS_CLASS_LAMBDA)
|
||||
#if defined( KOKKOS_ENABLE_CXX1Z ) && !defined( KOKKOS_CLASS_LAMBDA )
|
||||
#define KOKKOS_CLASS_LAMBDA [=,*this]
|
||||
#endif
|
||||
|
||||
//#if ! defined( __CUDA_ARCH__ ) /* Not compiling Cuda code to 'ptx'. */
|
||||
//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'.
|
||||
|
||||
/* Intel compiler for host code */
|
||||
// Intel compiler for host code.
|
||||
|
||||
#if defined( __INTEL_COMPILER )
|
||||
#define KOKKOS_COMPILER_INTEL __INTEL_COMPILER
|
||||
@ -218,7 +211,7 @@
|
||||
#define KOKKOS_COMPILER_INTEL __ECC
|
||||
#endif
|
||||
|
||||
/* CRAY compiler for host code */
|
||||
// CRAY compiler for host code
|
||||
#if defined( _CRAYC )
|
||||
#define KOKKOS_COMPILER_CRAYC _CRAYC
|
||||
#endif
|
||||
@ -234,38 +227,41 @@
|
||||
#define KOKKOS_COMPILER_APPLECC __APPLE_CC__
|
||||
#endif
|
||||
|
||||
#if defined (__clang__) && !defined (KOKKOS_COMPILER_INTEL)
|
||||
#if defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL )
|
||||
#define KOKKOS_COMPILER_CLANG __clang_major__*100+__clang_minor__*10+__clang_patchlevel__
|
||||
#endif
|
||||
|
||||
#if ! defined( __clang__ ) && ! defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
|
||||
#if !defined( __clang__ ) && !defined( KOKKOS_COMPILER_INTEL ) &&defined( __GNUC__ )
|
||||
#define KOKKOS_COMPILER_GNU __GNUC__*100+__GNUC_MINOR__*10+__GNUC_PATCHLEVEL__
|
||||
|
||||
#if ( 472 > KOKKOS_COMPILER_GNU )
|
||||
#error "Compiling with GCC version earlier than 4.7.2 is not supported."
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined( __PGIC__ ) && ! defined( __GNUC__ )
|
||||
#if defined( __PGIC__ ) && !defined( __GNUC__ )
|
||||
#define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__
|
||||
|
||||
#if ( 1540 > KOKKOS_COMPILER_PGI )
|
||||
#error "Compiling with PGI version earlier than 15.4 is not supported."
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//#endif /* #if ! defined( __CUDA_ARCH__ ) */
|
||||
//#endif // #if !defined( __CUDA_ARCH__ )
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Intel compiler macros */
|
||||
//----------------------------------------------------------------------------
|
||||
// Intel compiler macros
|
||||
|
||||
#if defined( KOKKOS_COMPILER_INTEL )
|
||||
|
||||
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
|
||||
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
|
||||
#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
|
||||
#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
|
||||
#define KOKKOS_ENABLE_PRAGMA_SIMD 1
|
||||
|
||||
#if ( __INTEL_COMPILER > 1400 )
|
||||
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
|
||||
#endif
|
||||
|
||||
#define KOKKOS_RESTRICT __restrict__
|
||||
|
||||
#ifndef KOKKOS_ALIGN
|
||||
@ -287,12 +283,13 @@
|
||||
#warning "Compiling with Intel version 13.x probably works but is not officially supported. Official minimal version is 14.0."
|
||||
#endif
|
||||
#endif
|
||||
#if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( _WIN32 )
|
||||
|
||||
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( _WIN32 )
|
||||
#define KOKKOS_ENABLE_ASM 1
|
||||
#endif
|
||||
|
||||
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#if !defined (_WIN32)
|
||||
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#if !defined( _WIN32 )
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
|
||||
#else
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION inline
|
||||
@ -302,192 +299,170 @@
|
||||
#if defined( __MIC__ )
|
||||
// Compiling for Xeon Phi
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Cray compiler macros */
|
||||
//----------------------------------------------------------------------------
|
||||
// Cray compiler macros
|
||||
|
||||
#if defined( KOKKOS_COMPILER_CRAYC )
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* IBM Compiler macros */
|
||||
//----------------------------------------------------------------------------
|
||||
// IBM Compiler macros
|
||||
|
||||
#if defined( KOKKOS_COMPILER_IBM )
|
||||
|
||||
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* CLANG compiler macros */
|
||||
//----------------------------------------------------------------------------
|
||||
// CLANG compiler macros
|
||||
|
||||
#if defined( KOKKOS_COMPILER_CLANG )
|
||||
|
||||
//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
|
||||
|
||||
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* GNU Compiler macros */
|
||||
//----------------------------------------------------------------------------
|
||||
// GNU Compiler macros
|
||||
|
||||
#if defined( KOKKOS_COMPILER_GNU )
|
||||
|
||||
//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
|
||||
|
||||
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline))
|
||||
#endif
|
||||
|
||||
#if ! defined( KOKKOS_ENABLE_ASM ) && ! defined( __PGIC__ ) && \
|
||||
( defined( __amd64 ) || \
|
||||
defined( __amd64__ ) || \
|
||||
defined( __x86_64 ) || \
|
||||
defined( __x86_64__ ) )
|
||||
#if !defined( KOKKOS_ENABLE_ASM ) && !defined( __PGIC__ ) && \
|
||||
( defined( __amd64 ) || defined( __amd64__ ) || \
|
||||
defined( __x86_64 ) || defined( __x86_64__ ) )
|
||||
#define KOKKOS_ENABLE_ASM 1
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if defined( KOKKOS_COMPILER_PGI )
|
||||
|
||||
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
|
||||
#define KOKKOS_ENABLE_PRAGMA_IVDEP 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1
|
||||
#define KOKKOS_ENABLE_PRAGMA_VECTOR 1
|
||||
//#define KOKKOS_ENABLE_PRAGMA_SIMD 1
|
||||
|
||||
#endif
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if defined( KOKKOS_COMPILER_NVCC )
|
||||
|
||||
#if defined(__CUDA_ARCH__ )
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
#define KOKKOS_ENABLE_PRAGMA_UNROLL 1
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** Define function marking macros if compiler specific macros are undefined: */
|
||||
// Define function marking macros if compiler specific macros are undefined:
|
||||
|
||||
#if ! defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION inline
|
||||
#if !defined( KOKKOS_FORCEINLINE_FUNCTION )
|
||||
#define KOKKOS_FORCEINLINE_FUNCTION inline
|
||||
#endif
|
||||
|
||||
#if ! defined( KOKKOS_INLINE_FUNCTION )
|
||||
#define KOKKOS_INLINE_FUNCTION inline
|
||||
#if !defined( KOKKOS_INLINE_FUNCTION )
|
||||
#define KOKKOS_INLINE_FUNCTION inline
|
||||
#endif
|
||||
|
||||
#if ! defined( KOKKOS_FUNCTION )
|
||||
#define KOKKOS_FUNCTION /**/
|
||||
#endif
|
||||
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
///** Define empty macro for restrict if necessary: */
|
||||
|
||||
#if ! defined(KOKKOS_RESTRICT)
|
||||
#define KOKKOS_RESTRICT
|
||||
#if !defined( KOKKOS_FUNCTION )
|
||||
#define KOKKOS_FUNCTION /**/
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** Define Macro for alignment: */
|
||||
#if ! defined KOKKOS_ALIGN_SIZE
|
||||
#define KOKKOS_ALIGN_SIZE 16
|
||||
#endif
|
||||
// Define empty macro for restrict if necessary:
|
||||
|
||||
#if ! defined(KOKKOS_ALIGN)
|
||||
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
|
||||
#endif
|
||||
|
||||
#if ! defined(KOKKOS_ALIGN_PTR)
|
||||
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
|
||||
#if !defined( KOKKOS_RESTRICT )
|
||||
#define KOKKOS_RESTRICT
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** Determine the default execution space for parallel dispatch.
|
||||
* There is zero or one default execution space specified.
|
||||
*/
|
||||
|
||||
#if 1 < ( ( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
|
||||
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
|
||||
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
|
||||
( defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
|
||||
|
||||
#error "More than one KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_* specified" ;
|
||||
// Define Macro for alignment:
|
||||
|
||||
#if !defined KOKKOS_ALIGN_SIZE
|
||||
#define KOKKOS_ALIGN_SIZE 16
|
||||
#endif
|
||||
|
||||
/** If default is not specified then chose from enabled execution spaces.
|
||||
* Priority: CUDA, OPENMP, THREADS, SERIAL
|
||||
*/
|
||||
#if defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
#elif defined ( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
#elif defined ( KOKKOS_ENABLE_CUDA )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
|
||||
#elif defined ( KOKKOS_ENABLE_OPENMP )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
|
||||
#elif defined ( KOKKOS_ENABLE_PTHREAD )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
|
||||
#if !defined( KOKKOS_ALIGN )
|
||||
#define KOKKOS_ALIGN(size) __attribute__((aligned(size)))
|
||||
#endif
|
||||
|
||||
#if !defined( KOKKOS_ALIGN_PTR )
|
||||
#define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size)))
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Determine the default execution space for parallel dispatch.
|
||||
// There is zero or one default execution space specified.
|
||||
|
||||
#if 1 < ( ( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS ) ? 1 : 0 ) + \
|
||||
( defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL ) ? 1 : 0 ) )
|
||||
#error "More than one KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_* specified."
|
||||
#endif
|
||||
|
||||
// If default is not specified then chose from enabled execution spaces.
|
||||
// Priority: CUDA, OPENMP, THREADS, QTHREADS, SERIAL
|
||||
#if defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS )
|
||||
//#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS )
|
||||
#elif defined( KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL )
|
||||
#elif defined( KOKKOS_ENABLE_CUDA )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
|
||||
#elif defined( KOKKOS_ENABLE_OPENMP )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
|
||||
#elif defined( KOKKOS_ENABLE_PTHREAD )
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
|
||||
//#elif defined( KOKKOS_ENABLE_QTHREADS )
|
||||
// #define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
|
||||
#else
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
|
||||
#define KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** Determine for what space the code is being compiled: */
|
||||
// Determine for what space the code is being compiled:
|
||||
|
||||
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined (KOKKOS_ENABLE_CUDA)
|
||||
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
|
||||
#if defined( __CUDACC__ ) && defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA )
|
||||
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA
|
||||
#else
|
||||
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
#define KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#if ( defined( _POSIX_C_SOURCE ) && _POSIX_C_SOURCE >= 200112L ) || \
|
||||
( defined( _XOPEN_SOURCE ) && _XOPEN_SOURCE >= 600 )
|
||||
#if defined(KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN)
|
||||
#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
|
||||
#endif
|
||||
#if defined( KOKKOS_ENABLE_PERFORMANCE_POSIX_MEMALIGN )
|
||||
#define KOKKOS_ENABLE_POSIX_MEMALIGN 1
|
||||
#endif
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/**Enable Profiling by default**/
|
||||
// Enable Profiling by default
|
||||
|
||||
#ifndef KOKKOS_ENABLE_PROFILING
|
||||
#define KOKKOS_ENABLE_PROFILING 1
|
||||
#define KOKKOS_ENABLE_PROFILING 1
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #ifndef KOKKOS_MACROS_HPP */
|
||||
|
||||
#endif // #ifndef KOKKOS_MACROS_HPP
|
||||
|
||||
@ -1294,6 +1294,7 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_t get_min_block_size() const { return MIN_BLOCK_SIZE; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
size_t get_mem_size() const { return m_data_size; }
|
||||
|
||||
private:
|
||||
|
||||
@ -66,7 +66,6 @@
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
@ -196,6 +195,7 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
|
||||
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_OPENMP ) && defined( _OPENMP ) */
|
||||
|
||||
@ -78,16 +78,14 @@ struct pair
|
||||
/// This calls the default constructors of T1 and T2. It won't
|
||||
/// compile if those default constructors are not defined and
|
||||
/// public.
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
pair()
|
||||
: first(), second()
|
||||
{}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair() = default ;
|
||||
|
||||
/// \brief Constructor that takes both elements of the pair.
|
||||
///
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair(first_type const& f, second_type const& s)
|
||||
: first(f), second(s)
|
||||
{}
|
||||
@ -97,7 +95,7 @@ struct pair
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
template <class U, class V>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair( const pair<U,V> &p)
|
||||
: first(p.first), second(p.second)
|
||||
{}
|
||||
@ -107,7 +105,7 @@ struct pair
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
template <class U, class V>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair( const volatile pair<U,V> &p)
|
||||
: first(p.first), second(p.second)
|
||||
{}
|
||||
@ -183,7 +181,7 @@ struct pair<T1&, T2&>
|
||||
///
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair(first_type f, second_type s)
|
||||
: first(f), second(s)
|
||||
{}
|
||||
@ -193,7 +191,7 @@ struct pair<T1&, T2&>
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
template <class U, class V>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair( const pair<U,V> &p)
|
||||
: first(p.first), second(p.second)
|
||||
{}
|
||||
@ -247,7 +245,7 @@ struct pair<T1, T2&>
|
||||
///
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair(first_type const& f, second_type s)
|
||||
: first(f), second(s)
|
||||
{}
|
||||
@ -257,7 +255,7 @@ struct pair<T1, T2&>
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
template <class U, class V>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair( const pair<U,V> &p)
|
||||
: first(p.first), second(p.second)
|
||||
{}
|
||||
@ -311,7 +309,7 @@ struct pair<T1&, T2>
|
||||
///
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair(first_type f, second_type const& s)
|
||||
: first(f), second(s)
|
||||
{}
|
||||
@ -321,7 +319,7 @@ struct pair<T1&, T2>
|
||||
/// This calls the copy constructors of T1 and T2. It won't compile
|
||||
/// if those copy constructors are not defined and public.
|
||||
template <class U, class V>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair( const pair<U,V> &p)
|
||||
: first(p.first), second(p.second)
|
||||
{}
|
||||
@ -366,31 +364,31 @@ bool operator== (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
|
||||
|
||||
//! Inequality operator for Kokkos::pair.
|
||||
template <class T1, class T2>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator!= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
|
||||
{ return !(lhs==rhs); }
|
||||
|
||||
//! Less-than operator for Kokkos::pair.
|
||||
template <class T1, class T2>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator< (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
|
||||
{ return lhs.first<rhs.first || (!(rhs.first<lhs.first) && lhs.second<rhs.second); }
|
||||
|
||||
//! Less-than-or-equal-to operator for Kokkos::pair.
|
||||
template <class T1, class T2>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator<= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
|
||||
{ return !(rhs<lhs); }
|
||||
|
||||
//! Greater-than operator for Kokkos::pair.
|
||||
template <class T1, class T2>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator> (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
|
||||
{ return rhs<lhs; }
|
||||
|
||||
//! Greater-than-or-equal-to operator for Kokkos::pair.
|
||||
template <class T1, class T2>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
|
||||
{ return !(lhs<rhs); }
|
||||
|
||||
@ -399,7 +397,7 @@ bool operator>= (const pair<T1,T2>& lhs, const pair<T1,T2>& rhs)
|
||||
/// This is a "nonmember constructor" for Kokkos::pair. It works just
|
||||
/// like std::make_pair.
|
||||
template <class T1,class T2>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair<T1,T2> make_pair (T1 x, T2 y)
|
||||
{ return ( pair<T1,T2>(x,y) ); }
|
||||
|
||||
@ -460,23 +458,21 @@ struct pair<T1,void>
|
||||
first_type first;
|
||||
enum { second = 0 };
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
pair()
|
||||
: first()
|
||||
{}
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair() = default ;
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair(const first_type & f)
|
||||
: first(f)
|
||||
{}
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair(const first_type & f, int)
|
||||
: first(f)
|
||||
{}
|
||||
|
||||
template <class U>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
pair( const pair<U,void> &p)
|
||||
: first(p.first)
|
||||
{}
|
||||
@ -495,32 +491,32 @@ struct pair<T1,void>
|
||||
//
|
||||
|
||||
template <class T1>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator== (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
|
||||
{ return lhs.first==rhs.first; }
|
||||
|
||||
template <class T1>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator!= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
|
||||
{ return !(lhs==rhs); }
|
||||
|
||||
template <class T1>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator< (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
|
||||
{ return lhs.first<rhs.first; }
|
||||
|
||||
template <class T1>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator<= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
|
||||
{ return !(rhs<lhs); }
|
||||
|
||||
template <class T1>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator> (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
|
||||
{ return rhs<lhs; }
|
||||
|
||||
template <class T1>
|
||||
KOKKOS_FORCEINLINE_FUNCTION
|
||||
KOKKOS_FORCEINLINE_FUNCTION constexpr
|
||||
bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
|
||||
{ return !(lhs<rhs); }
|
||||
|
||||
@ -528,3 +524,4 @@ bool operator>= (const pair<T1,void>& lhs, const pair<T1,void>& rhs)
|
||||
|
||||
|
||||
#endif //KOKKOS_PAIR_HPP
|
||||
|
||||
|
||||
@ -52,13 +52,14 @@
|
||||
#include <Kokkos_View.hpp>
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#include <typeinfo>
|
||||
#endif
|
||||
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_FunctorAnalysis.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#ifdef KOKKOS_DEBUG
|
||||
@ -175,7 +176,7 @@ void parallel_for( const ExecPolicy & policy
|
||||
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
|
||||
)
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
@ -188,7 +189,7 @@ void parallel_for( const ExecPolicy & policy
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelFor(kpID);
|
||||
}
|
||||
@ -207,7 +208,7 @@ void parallel_for( const size_t work_count
|
||||
execution_space ;
|
||||
typedef RangePolicy< execution_space > policy ;
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
@ -220,7 +221,7 @@ void parallel_for( const size_t work_count
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelFor(kpID);
|
||||
}
|
||||
@ -417,7 +418,7 @@ void parallel_scan( const ExecutionPolicy & policy
|
||||
, typename Impl::enable_if< ! Impl::is_integral< ExecutionPolicy >::value >::type * = 0
|
||||
)
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
@ -430,7 +431,7 @@ void parallel_scan( const ExecutionPolicy & policy
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
@ -450,7 +451,7 @@ void parallel_scan( const size_t work_count
|
||||
|
||||
typedef Kokkos::RangePolicy< execution_space > policy ;
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
|
||||
@ -463,7 +464,7 @@ void parallel_scan( const size_t work_count
|
||||
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelScan(kpID);
|
||||
}
|
||||
|
||||
@ -1094,7 +1094,7 @@ namespace Impl {
|
||||
const PolicyType& policy,
|
||||
const FunctorType& functor,
|
||||
ReturnType& return_value) {
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
uint64_t kpID = 0;
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::beginParallelReduce("" == label ? typeid(FunctorType).name() : label, 0, &kpID);
|
||||
@ -1116,7 +1116,7 @@ namespace Impl {
|
||||
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
|
||||
closure.execute();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::endParallelReduce(kpID);
|
||||
}
|
||||
|
||||
@ -41,52 +41,70 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_QTHREAD_HPP
|
||||
#define KOKKOS_QTHREAD_HPP
|
||||
#ifndef KOKKOS_QTHREADS_HPP
|
||||
#define KOKKOS_QTHREADS_HPP
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#ifdef KOKKOS_ENABLE_QTHREADS
|
||||
|
||||
// Defines to enable experimental Qthreads functionality.
|
||||
#define QTHREAD_LOCAL_PRIORITY
|
||||
#define CLONED_TASKS
|
||||
|
||||
#include <qthread.h>
|
||||
|
||||
#include <cstddef>
|
||||
#include <iosfwd>
|
||||
#include <Kokkos_Core.hpp>
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
|
||||
#include <Kokkos_HostSpace.hpp>
|
||||
#include <Kokkos_ExecPolicy.hpp>
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
//#include <Kokkos_MemoryTraits.hpp>
|
||||
//#include <Kokkos_ExecPolicy.hpp>
|
||||
//#include <Kokkos_TaskScheduler.hpp> // Uncomment when Tasking working.
|
||||
#include <Kokkos_Layout.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <KokkosExp_MDRangePolicy.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
class QthreadExec ;
|
||||
|
||||
class QthreadsExec;
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/** \brief Execution space supported by Qthread */
|
||||
class Qthread {
|
||||
/** \brief Execution space supported by Qthreads */
|
||||
class Qthreads {
|
||||
public:
|
||||
//! \name Type declarations that all Kokkos devices must provide.
|
||||
//@{
|
||||
|
||||
//! Tag this class as an execution space
|
||||
typedef Qthread execution_space ;
|
||||
typedef Kokkos::HostSpace memory_space ;
|
||||
typedef Qthreads execution_space;
|
||||
typedef Kokkos::HostSpace memory_space;
|
||||
//! This execution space preferred device_type
|
||||
typedef Kokkos::Device<execution_space,memory_space> device_type;
|
||||
typedef Kokkos::Device< execution_space, memory_space > device_type;
|
||||
|
||||
typedef Kokkos::LayoutRight array_layout ;
|
||||
typedef memory_space::size_type size_type ;
|
||||
typedef Kokkos::LayoutRight array_layout;
|
||||
typedef memory_space::size_type size_type;
|
||||
|
||||
typedef ScratchMemorySpace< Qthread > scratch_memory_space ;
|
||||
typedef ScratchMemorySpace< Qthreads > scratch_memory_space;
|
||||
|
||||
//@}
|
||||
/*------------------------------------------------------------------------*/
|
||||
|
||||
/** \brief Initialization will construct one or more instances */
|
||||
static Qthread & instance( int = 0 );
|
||||
static Qthreads & instance( int = 0 );
|
||||
|
||||
/** \brief Set the execution space to a "sleep" state.
|
||||
*
|
||||
@ -128,26 +146,24 @@ public:
|
||||
static void finalize();
|
||||
|
||||
/** \brief Print configuration information to the given output stream. */
|
||||
static void print_configuration( std::ostream & , const bool detail = false );
|
||||
static void print_configuration( std::ostream &, const bool detail = false );
|
||||
|
||||
int shepherd_size() const ;
|
||||
int shepherd_worker_size() const ;
|
||||
int shepherd_size() const;
|
||||
int shepherd_worker_size() const;
|
||||
};
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
struct MemorySpaceAccess
|
||||
< Kokkos::Qthread::memory_space
|
||||
, Kokkos::Qthread::scratch_memory_space
|
||||
< Kokkos::Qthreads::memory_space
|
||||
, Kokkos::Qthreads::scratch_memory_space
|
||||
>
|
||||
{
|
||||
enum { assignable = false };
|
||||
@ -157,27 +173,26 @@ struct MemorySpaceAccess
|
||||
|
||||
template<>
|
||||
struct VerifyExecutionCanAccessMemorySpace
|
||||
< Kokkos::Qthread::memory_space
|
||||
, Kokkos::Qthread::scratch_memory_space
|
||||
< Kokkos::Qthreads::memory_space
|
||||
, Kokkos::Qthreads::scratch_memory_space
|
||||
>
|
||||
{
|
||||
enum { value = true };
|
||||
inline static void verify( void ) { }
|
||||
inline static void verify( const void * ) { }
|
||||
inline static void verify( void ) {}
|
||||
inline static void verify( const void * ) {}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <Qthread/Kokkos_QthreadExec.hpp>
|
||||
#include <Qthread/Kokkos_Qthread_Parallel.hpp>
|
||||
#include <Qthreads/Kokkos_QthreadsExec.hpp>
|
||||
#include <Qthreads/Kokkos_Qthreads_Parallel.hpp>
|
||||
//#include <Qthreads/Kokkos_Qthreads_Task.hpp> // Uncomment when Tasking working.
|
||||
//#include <Qthreads/Kokkos_Qthreads_TaskQueue.hpp> // Uncomment when Tasking working.
|
||||
|
||||
#endif /* #define KOKKOS_QTHREAD_HPP */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
#endif // #define KOKKOS_ENABLE_QTHREADS
|
||||
|
||||
#endif // #define KOKKOS_QTHREADS_HPP
|
||||
@ -56,6 +56,8 @@
|
||||
#include <Kokkos_ScratchSpace.hpp>
|
||||
#include <Kokkos_MemoryTraits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
#include <impl/Kokkos_FunctorAnalysis.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
@ -138,30 +140,15 @@ public:
|
||||
static void initialize( unsigned threads_count = 1 ,
|
||||
unsigned use_numa_count = 0 ,
|
||||
unsigned use_cores_per_numa = 0 ,
|
||||
bool allow_asynchronous_threadpool = false) {
|
||||
(void) threads_count;
|
||||
(void) use_numa_count;
|
||||
(void) use_cores_per_numa;
|
||||
(void) allow_asynchronous_threadpool;
|
||||
bool allow_asynchronous_threadpool = false);
|
||||
|
||||
// Init the array of locks used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
|
||||
static int is_initialized() { return 1 ; }
|
||||
static int is_initialized();
|
||||
|
||||
/** \brief Return the maximum amount of concurrency. */
|
||||
static int concurrency() {return 1;};
|
||||
|
||||
//! Free any resources being consumed by the device.
|
||||
static void finalize() {
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
static void finalize();
|
||||
|
||||
//! Print configuration information to the given output stream.
|
||||
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
|
||||
@ -177,10 +164,6 @@ public:
|
||||
inline static unsigned max_hardware_threads() { return thread_pool_size(0); }
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
static void * scratch_memory_resize( unsigned reduce_size , unsigned shared_size );
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
@ -213,22 +196,6 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
inline static void verify( const void * ) { }
|
||||
};
|
||||
|
||||
namespace SerialImpl {
|
||||
|
||||
struct Sentinel {
|
||||
|
||||
void * m_scratch ;
|
||||
unsigned m_reduce_end ;
|
||||
unsigned m_shared_end ;
|
||||
|
||||
Sentinel();
|
||||
~Sentinel();
|
||||
static Sentinel & singleton();
|
||||
};
|
||||
|
||||
inline
|
||||
unsigned align( unsigned n );
|
||||
}
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
@ -238,89 +205,26 @@ unsigned align( unsigned n );
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
class SerialTeamMember {
|
||||
private:
|
||||
typedef Kokkos::ScratchMemorySpace< Kokkos::Serial > scratch_memory_space ;
|
||||
const scratch_memory_space m_space ;
|
||||
const int m_league_rank ;
|
||||
const int m_league_size ;
|
||||
// Resize thread team data scratch memory
|
||||
void serial_resize_thread_team_data( size_t pool_reduce_bytes
|
||||
, size_t team_reduce_bytes
|
||||
, size_t team_shared_bytes
|
||||
, size_t thread_local_bytes );
|
||||
|
||||
SerialTeamMember & operator = ( const SerialTeamMember & );
|
||||
HostThreadTeamData * serial_get_thread_team_data();
|
||||
|
||||
public:
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const scratch_memory_space & team_shmem() const { return m_space ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const scratch_memory_space & team_scratch(int) const
|
||||
{ return m_space ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const scratch_memory_space & thread_scratch(int) const
|
||||
{ return m_space ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
|
||||
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
|
||||
|
||||
template<class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast(const ValueType& , const int& ) const {}
|
||||
|
||||
template< class ValueType, class JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType team_reduce( const ValueType & value , const JoinOp & ) const
|
||||
{
|
||||
return value ;
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
* with intra-team non-deterministic ordering accumulation.
|
||||
*
|
||||
* The global inter-team accumulation value will, at the end of the
|
||||
* league's parallel execution, be the scan's total.
|
||||
* Parallel execution ordering of the league's teams is non-deterministic.
|
||||
* As such the base value for each team's scan operation is similarly
|
||||
* non-deterministic.
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
|
||||
{
|
||||
const Type tmp = global_accum ? *global_accum : Type(0) ;
|
||||
if ( global_accum ) { *global_accum += value ; }
|
||||
return tmp ;
|
||||
}
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
*
|
||||
* The highest rank thread can compute the reduction total as
|
||||
* reduction_total = dev.team_scan( value ) + value ;
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & ) const
|
||||
{ return Type(0); }
|
||||
|
||||
//----------------------------------------
|
||||
// Execution space specific:
|
||||
|
||||
SerialTeamMember( int arg_league_rank
|
||||
, int arg_league_size
|
||||
, int arg_shared_size
|
||||
);
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/*
|
||||
* < Kokkos::Serial , WorkArgTag >
|
||||
* < WorkArgTag , Impl::enable_if< std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value >::type >
|
||||
*
|
||||
*/
|
||||
namespace Impl {
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<Properties...>
|
||||
{
|
||||
@ -441,14 +345,11 @@ public:
|
||||
return p;
|
||||
};
|
||||
|
||||
typedef Impl::SerialTeamMember member_type ;
|
||||
typedef Impl::HostThreadTeamMember< Kokkos::Serial > member_type ;
|
||||
};
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Parallel patterns for Kokkos::Serial with RangePolicy */
|
||||
@ -521,11 +422,12 @@ private:
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
@ -535,34 +437,25 @@ private:
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
exec( reference_type update ) const
|
||||
{
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( i , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
exec( reference_type update ) const
|
||||
{
|
||||
const TagType t{} ;
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( t , i , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
public:
|
||||
@ -570,10 +463,29 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
|
||||
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
const size_t pool_reduce_size =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor , m_reducer) );
|
||||
const size_t team_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_shared_size = 0 ; // Never shrinks
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
|
||||
serial_resize_thread_team_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
HostThreadTeamData & data = *serial_get_thread_team_data();
|
||||
|
||||
pointer_type ptr =
|
||||
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
|
||||
|
||||
reference_type update =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
this-> template exec< WorkTag >( update );
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
template< class HostViewType >
|
||||
@ -587,7 +499,7 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result_view.ptr_on_device() )
|
||||
, m_result_ptr( arg_result_view.data() )
|
||||
{
|
||||
static_assert( Kokkos::is_view< HostViewType >::value
|
||||
, "Kokkos::Serial reduce result must be a View" );
|
||||
@ -623,11 +535,13 @@ private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
|
||||
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
@ -635,10 +549,8 @@ private:
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
exec( reference_type update ) const
|
||||
{
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( i , update , true );
|
||||
@ -648,11 +560,9 @@ private:
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
exec( reference_type update ) const
|
||||
{
|
||||
const TagType t{} ;
|
||||
reference_type update = ValueInit::init( m_functor , ptr );
|
||||
|
||||
const typename Policy::member_type e = m_policy.end();
|
||||
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
|
||||
m_functor( t , i , update , true );
|
||||
@ -664,9 +574,22 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
pointer_type ptr = (pointer_type)
|
||||
Kokkos::Serial::scratch_memory_resize( ValueTraits::value_size( m_functor ) , 0 );
|
||||
this-> template exec< WorkTag >( ptr );
|
||||
const size_t pool_reduce_size = Analysis::value_size( m_functor );
|
||||
const size_t team_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_shared_size = 0 ; // Never shrinks
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
serial_resize_thread_team_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
HostThreadTeamData & data = *serial_get_thread_team_data();
|
||||
|
||||
reference_type update =
|
||||
ValueInit::init( m_functor , pointer_type(data.pool_reduce_local()) );
|
||||
|
||||
this-> template exec< WorkTag >( update );
|
||||
}
|
||||
|
||||
inline
|
||||
@ -696,6 +619,8 @@ class ParallelFor< FunctorType
|
||||
{
|
||||
private:
|
||||
|
||||
enum { TEAM_REDUCE_SIZE = 512 };
|
||||
|
||||
typedef TeamPolicyInternal< Kokkos::Serial , Properties ...> Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
@ -706,21 +631,21 @@ private:
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec() const
|
||||
exec( HostThreadTeamData & data ) const
|
||||
{
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( Member(ileague,m_league,m_shared) );
|
||||
m_functor( Member(data,ileague,m_league) );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec() const
|
||||
exec( HostThreadTeamData & data ) const
|
||||
{
|
||||
const TagType t{} ;
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( t , Member(ileague,m_league,m_shared) );
|
||||
m_functor( t , Member(data,ileague,m_league) );
|
||||
}
|
||||
}
|
||||
|
||||
@ -729,15 +654,28 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
Kokkos::Serial::scratch_memory_resize( 0 , m_shared );
|
||||
this-> template exec< typename Policy::work_tag >();
|
||||
const size_t pool_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_reduce_size = TEAM_REDUCE_SIZE ;
|
||||
const size_t team_shared_size = m_shared ;
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
serial_resize_thread_team_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
HostThreadTeamData & data = *serial_get_thread_team_data();
|
||||
|
||||
this->template exec< typename Policy::work_tag >( data );
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor
|
||||
, const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
, m_shared( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
{ }
|
||||
};
|
||||
|
||||
@ -752,18 +690,22 @@ class ParallelReduce< FunctorType
|
||||
{
|
||||
private:
|
||||
|
||||
enum { TEAM_REDUCE_SIZE = 512 };
|
||||
|
||||
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
|
||||
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const int m_league ;
|
||||
@ -774,33 +716,23 @@ private:
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
exec( HostThreadTeamData & data , reference_type update ) const
|
||||
{
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( Member(ileague,m_league,m_shared) , update );
|
||||
m_functor( Member(data,ileague,m_league) , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
template< class TagType >
|
||||
inline
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec( pointer_type ptr ) const
|
||||
exec( HostThreadTeamData & data , reference_type update ) const
|
||||
{
|
||||
const TagType t{} ;
|
||||
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
|
||||
m_functor( t , Member(ileague,m_league,m_shared) , update );
|
||||
m_functor( t , Member(data,ileague,m_league) , update );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
public:
|
||||
@ -808,10 +740,31 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
|
||||
( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
|
||||
const size_t pool_reduce_size =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
||||
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
|
||||
const size_t team_reduce_size = TEAM_REDUCE_SIZE ;
|
||||
const size_t team_shared_size = m_shared ;
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
serial_resize_thread_team_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
|
||||
HostThreadTeamData & data = *serial_get_thread_team_data();
|
||||
|
||||
pointer_type ptr =
|
||||
m_result_ptr ? m_result_ptr : pointer_type(data.pool_reduce_local());
|
||||
|
||||
reference_type update =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
this-> template exec< WorkTag >( data , update );
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::
|
||||
final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
}
|
||||
|
||||
template< class ViewType >
|
||||
@ -825,8 +778,10 @@ public:
|
||||
: m_functor( arg_functor )
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
|
||||
, m_result_ptr( arg_result.data() )
|
||||
, m_shared( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
|
||||
{
|
||||
static_assert( Kokkos::is_view< ViewType >::value
|
||||
, "Reduction result on Kokkos::Serial must be a Kokkos::View" );
|
||||
@ -844,7 +799,9 @@ public:
|
||||
, m_league( arg_policy.league_size() )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
, m_shared( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
@ -858,261 +815,6 @@ public:
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/*--------------------------------------------------------------------------*/
|
||||
/* Nested parallel patterns for Kokkos::Serial with TeamPolicy */
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType,SerialTeamMember> {
|
||||
typedef iType index_type;
|
||||
const iType begin ;
|
||||
const iType end ;
|
||||
enum {increment = 1};
|
||||
const SerialTeamMember& thread;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_count)
|
||||
: begin(0)
|
||||
, end(arg_count)
|
||||
, thread(arg_thread)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct (const SerialTeamMember& arg_thread, const iType& arg_begin, const iType & arg_end )
|
||||
: begin( arg_begin )
|
||||
, end( arg_end)
|
||||
, thread( arg_thread )
|
||||
{}
|
||||
};
|
||||
|
||||
template<typename iType>
|
||||
struct ThreadVectorRangeBoundariesStruct<iType,SerialTeamMember> {
|
||||
typedef iType index_type;
|
||||
enum {start = 0};
|
||||
const iType end;
|
||||
enum {increment = 1};
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct (const SerialTeamMember& thread, const iType& count):
|
||||
end( count )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>
|
||||
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, count );
|
||||
}
|
||||
|
||||
template< typename iType1, typename iType2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::SerialTeamMember >
|
||||
TeamThreadRange( const Impl::SerialTeamMember& thread, const iType1 & begin, const iType2 & end )
|
||||
{
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SerialTeamMember >( thread, iType(begin), iType(end) );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >
|
||||
ThreadVectorRange(const Impl::SerialTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadSingleStruct<Impl::SerialTeamMember> PerTeam(const Impl::SerialTeamMember& thread) {
|
||||
return Impl::ThreadSingleStruct<Impl::SerialTeamMember>(thread);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::VectorSingleStruct<Impl::SerialTeamMember> PerThread(const Impl::SerialTeamMember& thread) {
|
||||
return Impl::VectorSingleStruct<Impl::SerialTeamMember>(thread);
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, ValueType& result) {
|
||||
|
||||
result = ValueType();
|
||||
|
||||
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
|
||||
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::SerialTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
|
||||
for( iType i = loop_boundaries.begin; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
|
||||
init_result = loop_boundaries.thread.team_reduce(result,Impl::JoinLambdaAdapter<ValueType,JoinType>(join));
|
||||
}
|
||||
|
||||
} //namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
|
||||
loop_boundaries, const Lambda& lambda) {
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
result = ValueType();
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
init_result = result;
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
|
||||
* for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
|
||||
* Depending on the target execution space the operator might be called twice: once with final=false
|
||||
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
|
||||
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
|
||||
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
|
||||
* to the final sum value over all vector lanes.
|
||||
* This functionality requires C++11 support.*/
|
||||
template< typename iType, class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::SerialTeamMember >&
|
||||
loop_boundaries, const FunctorType & lambda) {
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
value_type scan_val = value_type();
|
||||
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,scan_val,true);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
|
||||
lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda) {
|
||||
lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
|
||||
lambda(val);
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const FunctorType& lambda, ValueType& val) {
|
||||
lambda(val);
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <impl/Kokkos_Serial_Task.hpp>
|
||||
|
||||
|
||||
@ -82,6 +82,15 @@ class Future ;
|
||||
template< typename Space >
|
||||
class TaskScheduler ;
|
||||
|
||||
template< typename Space >
|
||||
void wait( TaskScheduler< Space > const & );
|
||||
|
||||
template< typename Space >
|
||||
struct is_scheduler : public std::false_type {};
|
||||
|
||||
template< typename Space >
|
||||
struct is_scheduler< TaskScheduler< Space > > : public std::true_type {};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#include <impl/Kokkos_TaskQueue.hpp>
|
||||
@ -109,9 +118,6 @@ namespace Impl {
|
||||
template< typename Space , typename ResultType , typename FunctorType >
|
||||
class TaskBase ;
|
||||
|
||||
template< typename Space >
|
||||
class TaskExec ;
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
@ -312,6 +318,19 @@ public:
|
||||
}
|
||||
};
|
||||
|
||||
// Is a Future with the given execution space
|
||||
template< typename , typename ExecSpace = void >
|
||||
struct is_future : public std::false_type {};
|
||||
|
||||
template< typename Arg1 , typename Arg2 , typename ExecSpace >
|
||||
struct is_future< Future<Arg1,Arg2> , ExecSpace >
|
||||
: public std::integral_constant
|
||||
< bool ,
|
||||
( std::is_same< ExecSpace , void >::value ||
|
||||
std::is_same< ExecSpace
|
||||
, typename Future<Arg1,Arg2>::execution_space >::value )
|
||||
> {};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -319,18 +338,59 @@ public:
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
|
||||
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
|
||||
|
||||
enum TaskPriority { TaskHighPriority = 0
|
||||
, TaskRegularPriority = 1
|
||||
, TaskLowPriority = 2 };
|
||||
|
||||
template< typename Space >
|
||||
void wait( TaskScheduler< Space > const & );
|
||||
enum class TaskPriority : int { High = 0
|
||||
, Regular = 1
|
||||
, Low = 2 };
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< int TaskEnum , typename DepFutureType >
|
||||
struct TaskPolicyData
|
||||
{
|
||||
using execution_space = typename DepFutureType::execution_space ;
|
||||
using scheduler_type = TaskScheduler< execution_space > ;
|
||||
|
||||
enum : int { m_task_type = TaskEnum };
|
||||
|
||||
scheduler_type const * m_scheduler ;
|
||||
DepFutureType const m_dependence ;
|
||||
int m_priority ;
|
||||
|
||||
TaskPolicyData() = delete ;
|
||||
TaskPolicyData( TaskPolicyData && ) = default ;
|
||||
TaskPolicyData( TaskPolicyData const & ) = default ;
|
||||
TaskPolicyData & operator = ( TaskPolicyData && ) = default ;
|
||||
TaskPolicyData & operator = ( TaskPolicyData const & ) = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicyData( DepFutureType && arg_future
|
||||
, Kokkos::TaskPriority const & arg_priority )
|
||||
: m_scheduler( 0 )
|
||||
, m_dependence( arg_future )
|
||||
, m_priority( static_cast<int>( arg_priority ) )
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskPolicyData( scheduler_type const & arg_scheduler
|
||||
, Kokkos::TaskPriority const & arg_priority )
|
||||
: m_scheduler( & arg_scheduler )
|
||||
, m_dependence()
|
||||
, m_priority( static_cast<int>( arg_priority ) )
|
||||
{}
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
@ -348,52 +408,13 @@ private:
|
||||
queue_type * m_queue ;
|
||||
|
||||
//----------------------------------------
|
||||
// Process optional arguments to spawn and respawn functions
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const ) {}
|
||||
|
||||
// TaskTeam or TaskSingle
|
||||
template< typename ... Options >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const task
|
||||
, TaskType const & arg
|
||||
, Options const & ... opts )
|
||||
{
|
||||
task->m_task_type = arg ;
|
||||
assign( task , opts ... );
|
||||
}
|
||||
|
||||
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
|
||||
template< typename ... Options >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const task
|
||||
, TaskPriority const & arg
|
||||
, Options const & ... opts )
|
||||
{
|
||||
task->m_priority = arg ;
|
||||
assign( task , opts ... );
|
||||
}
|
||||
|
||||
// Future for a dependence
|
||||
template< typename A1 , typename A2 , typename ... Options >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void assign( task_base * const task
|
||||
, Future< A1 , A2 > const & arg
|
||||
, Options const & ... opts )
|
||||
{
|
||||
task->add_dependence( arg.m_task );
|
||||
assign( task , opts ... );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
public:
|
||||
|
||||
using execution_policy = TaskScheduler ;
|
||||
using execution_space = ExecSpace ;
|
||||
using memory_space = typename queue_type::memory_space ;
|
||||
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
|
||||
using member_type =
|
||||
typename Kokkos::Impl::TaskQueueSpecialization< ExecSpace >::member_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskScheduler() : m_track(), m_queue(0) {}
|
||||
@ -460,18 +481,13 @@ public:
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
/**\brief A task spawns a task with options
|
||||
*
|
||||
* 1) High, Normal, or Low priority
|
||||
* 2) With or without dependence
|
||||
* 3) Team or Serial
|
||||
*/
|
||||
template< typename FunctorType , typename ... Options >
|
||||
KOKKOS_FUNCTION
|
||||
Future< typename FunctorType::value_type , ExecSpace >
|
||||
task_spawn( FunctorType const & arg_functor
|
||||
, Options const & ... arg_options
|
||||
) const
|
||||
template< int TaskEnum , typename DepFutureType , typename FunctorType >
|
||||
KOKKOS_FUNCTION static
|
||||
Kokkos::Future< typename FunctorType::value_type , execution_space >
|
||||
spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
|
||||
, typename task_base::function_type arg_function
|
||||
, FunctorType && arg_functor
|
||||
)
|
||||
{
|
||||
using value_type = typename FunctorType::value_type ;
|
||||
using future_type = Future< value_type , execution_space > ;
|
||||
@ -479,11 +495,21 @@ public:
|
||||
, value_type
|
||||
, FunctorType > ;
|
||||
|
||||
queue_type * const queue =
|
||||
arg_policy.m_scheduler ? arg_policy.m_scheduler->m_queue : (
|
||||
arg_policy.m_dependence.m_task
|
||||
? arg_policy.m_dependence.m_task->m_queue
|
||||
: (queue_type*) 0 );
|
||||
|
||||
if ( 0 == queue ) {
|
||||
Kokkos::abort("Kokkos spawn given null Future" );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Give single-thread back-ends an opportunity to clear
|
||||
// queue of ready tasks before allocating a new task
|
||||
|
||||
m_queue->iff_single_thread_recursive_execute();
|
||||
queue->iff_single_thread_recursive_execute();
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
@ -491,176 +517,129 @@ public:
|
||||
|
||||
// Allocate task from memory pool
|
||||
f.m_task =
|
||||
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
|
||||
reinterpret_cast< task_type * >(queue->allocate(sizeof(task_type)));
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
// Placement new construction
|
||||
new ( f.m_task ) task_type( arg_functor );
|
||||
// Reference count starts at two:
|
||||
// +1 for the matching decrement when task is complete
|
||||
// +1 for the future
|
||||
new ( f.m_task )
|
||||
task_type( arg_function
|
||||
, queue
|
||||
, arg_policy.m_dependence.m_task /* dependence */
|
||||
, 2 /* reference count */
|
||||
, int(sizeof(task_type)) /* allocation size */
|
||||
, int(arg_policy.m_task_type)
|
||||
, int(arg_policy.m_priority)
|
||||
, std::move(arg_functor) );
|
||||
|
||||
// Reference count starts at two
|
||||
// +1 for matching decrement when task is complete
|
||||
// +1 for future
|
||||
f.m_task->m_queue = m_queue ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = sizeof(task_type);
|
||||
// The dependence (if any) is processed immediately
|
||||
// within the schedule function, as such the dependence's
|
||||
// reference count does not need to be incremented for
|
||||
// the assignment.
|
||||
|
||||
assign( f.m_task , arg_options... );
|
||||
|
||||
// Spawning from within the execution space so the
|
||||
// apply function pointer is guaranteed to be valid
|
||||
f.m_task->m_apply = task_type::apply ;
|
||||
|
||||
m_queue->schedule( f.m_task );
|
||||
// this task may be updated or executed at any moment
|
||||
queue->schedule_runnable( f.m_task );
|
||||
// This task may be updated or executed at any moment,
|
||||
// even during the call to 'schedule'.
|
||||
}
|
||||
|
||||
return f ;
|
||||
}
|
||||
|
||||
/**\brief The host process spawns a task with options
|
||||
*
|
||||
* 1) High, Normal, or Low priority
|
||||
* 2) With or without dependence
|
||||
* 3) Team or Serial
|
||||
*/
|
||||
template< typename FunctorType , typename ... Options >
|
||||
inline
|
||||
Future< typename FunctorType::value_type , ExecSpace >
|
||||
host_spawn( FunctorType const & arg_functor
|
||||
, Options const & ... arg_options
|
||||
) const
|
||||
template< typename FunctorType , typename A1 , typename A2 >
|
||||
KOKKOS_FUNCTION static
|
||||
void
|
||||
respawn( FunctorType * arg_self
|
||||
, Future<A1,A2> const & arg_dependence
|
||||
, TaskPriority const & arg_priority
|
||||
)
|
||||
{
|
||||
// Precondition: task is in Executing state
|
||||
|
||||
using value_type = typename FunctorType::value_type ;
|
||||
using future_type = Future< value_type , execution_space > ;
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, value_type
|
||||
, FunctorType > ;
|
||||
|
||||
if ( m_queue == 0 ) {
|
||||
Kokkos::abort("Kokkos::TaskScheduler not initialized");
|
||||
}
|
||||
|
||||
future_type f ;
|
||||
|
||||
// Allocate task from memory pool
|
||||
f.m_task =
|
||||
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
// Placement new construction
|
||||
new( f.m_task ) task_type( arg_functor );
|
||||
|
||||
// Reference count starts at two:
|
||||
// +1 to match decrement when task completes
|
||||
// +1 for the future
|
||||
f.m_task->m_queue = m_queue ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = sizeof(task_type);
|
||||
|
||||
assign( f.m_task , arg_options... );
|
||||
|
||||
// Potentially spawning outside execution space so the
|
||||
// apply function pointer must be obtained from execution space.
|
||||
// Required for Cuda execution space function pointer.
|
||||
m_queue->template proc_set_apply< FunctorType >( & f.m_task->m_apply );
|
||||
|
||||
m_queue->schedule( f.m_task );
|
||||
}
|
||||
return f ;
|
||||
task_type * const task = static_cast< task_type * >( arg_self );
|
||||
|
||||
task->m_priority = static_cast<int>(arg_priority);
|
||||
|
||||
task->add_dependence( arg_dependence.m_task );
|
||||
|
||||
// Postcondition: task is in Executing-Respawn state
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
/**\brief Return a future that is complete
|
||||
* when all input futures are complete.
|
||||
*/
|
||||
template< typename A1 , typename A2 >
|
||||
KOKKOS_FUNCTION
|
||||
Future< ExecSpace >
|
||||
when_all( int narg , Future< A1 , A2 > const * const arg ) const
|
||||
KOKKOS_FUNCTION static
|
||||
Future< execution_space >
|
||||
when_all( Future< A1 , A2 > const arg[] , int narg )
|
||||
{
|
||||
static_assert
|
||||
( std::is_same< execution_space
|
||||
, typename Future< A1 , A2 >::execution_space
|
||||
>::value
|
||||
, "Future must have same execution space" );
|
||||
|
||||
using future_type = Future< ExecSpace > ;
|
||||
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
|
||||
using future_type = Future< execution_space > ;
|
||||
using task_base = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
|
||||
future_type f ;
|
||||
|
||||
if ( narg ) {
|
||||
|
||||
queue_type * queue = 0 ;
|
||||
|
||||
for ( int i = 0 ; i < narg ; ++i ) {
|
||||
task_base * const t = arg[i].m_task ;
|
||||
if ( 0 != t ) {
|
||||
// Increment reference count to track subsequent assignment.
|
||||
Kokkos::atomic_increment( &(t->m_ref_count) );
|
||||
if ( queue == 0 ) {
|
||||
queue = t->m_queue ;
|
||||
}
|
||||
else if ( queue != t->m_queue ) {
|
||||
Kokkos::abort("Kokkos when_all Futures must be in the same scheduler" );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( queue != 0 ) {
|
||||
|
||||
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
|
||||
|
||||
f.m_task =
|
||||
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
|
||||
reinterpret_cast< task_base * >( queue->allocate( size ) );
|
||||
|
||||
if ( f.m_task ) {
|
||||
|
||||
new( f.m_task ) task_base();
|
||||
|
||||
// Reference count starts at two:
|
||||
// +1 to match decrement when task completes
|
||||
// +1 for the future
|
||||
f.m_task->m_queue = m_queue ;
|
||||
f.m_task->m_ref_count = 2 ;
|
||||
f.m_task->m_alloc_size = size ;
|
||||
f.m_task->m_dep_count = narg ;
|
||||
f.m_task->m_task_type = task_base::Aggregate ;
|
||||
new( f.m_task ) task_base( queue
|
||||
, 2 /* reference count */
|
||||
, size /* allocation size */
|
||||
, narg /* dependence count */
|
||||
);
|
||||
|
||||
// Assign dependences, reference counts were already incremented
|
||||
|
||||
task_base ** const dep = f.m_task->aggregate_dependences();
|
||||
|
||||
// Assign dependences to increment their reference count
|
||||
// The futures may be destroyed upon returning from this call
|
||||
// so increment reference count to track this assignment.
|
||||
for ( int i = 0 ; i < narg ; ++i ) { dep[i] = arg[i].m_task ; }
|
||||
|
||||
for ( int i = 0 ; i < narg ; ++i ) {
|
||||
task_base * const t = dep[i] = arg[i].m_task ;
|
||||
if ( 0 != t ) {
|
||||
Kokkos::atomic_increment( &(t->m_ref_count) );
|
||||
}
|
||||
}
|
||||
|
||||
m_queue->schedule( f.m_task );
|
||||
queue->schedule_aggregate( f.m_task );
|
||||
// this when_all may be processed at any moment
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return f ;
|
||||
}
|
||||
|
||||
/**\brief An executing task respawns itself with options
|
||||
*
|
||||
* 1) High, Normal, or Low priority
|
||||
* 2) With or without dependence
|
||||
*/
|
||||
template< class FunctorType , typename ... Options >
|
||||
KOKKOS_FUNCTION
|
||||
void respawn( FunctorType * task_self
|
||||
, Options const & ... arg_options ) const
|
||||
{
|
||||
using value_type = typename FunctorType::value_type ;
|
||||
using task_type = Impl::TaskBase< execution_space
|
||||
, value_type
|
||||
, FunctorType > ;
|
||||
|
||||
task_type * const task = static_cast< task_type * >( task_self );
|
||||
|
||||
// Reschedule task with no dependences.
|
||||
m_queue->reschedule( task );
|
||||
|
||||
// Dependences, if requested, are added here through parsing the arguments.
|
||||
assign( task , arg_options... );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< typename S >
|
||||
friend
|
||||
void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int allocation_capacity() const noexcept
|
||||
{ return m_queue->m_memory.get_mem_size(); }
|
||||
|
||||
@ -676,12 +655,192 @@ public:
|
||||
long allocated_task_count_accum() const noexcept
|
||||
{ return m_queue->m_accum_alloc ; }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< typename S >
|
||||
friend
|
||||
void Kokkos::wait( Kokkos::TaskScheduler< S > const & );
|
||||
|
||||
};
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Construct a TaskTeam execution policy
|
||||
|
||||
template< typename T >
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
|
||||
, typename std::conditional< Kokkos::is_future< T >::value , T ,
|
||||
typename Kokkos::Future< typename T::execution_space > >::type
|
||||
>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskTeam( T const & arg
|
||||
, TaskPriority const & arg_priority = TaskPriority::Regular
|
||||
)
|
||||
{
|
||||
static_assert( Kokkos::is_future<T>::value ||
|
||||
Kokkos::is_scheduler<T>::value
|
||||
, "Kokkos TaskTeam argument must be Future or TaskScheduler" );
|
||||
|
||||
return
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskTeam
|
||||
, typename std::conditional< Kokkos::is_future< T >::value , T ,
|
||||
typename Kokkos::Future< typename T::execution_space > >::type
|
||||
>( arg , arg_priority );
|
||||
}
|
||||
|
||||
// Construct a TaskSingle execution policy
|
||||
|
||||
template< typename T >
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
|
||||
, typename std::conditional< Kokkos::is_future< T >::value , T ,
|
||||
typename Kokkos::Future< typename T::execution_space > >::type
|
||||
>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskSingle( T const & arg
|
||||
, TaskPriority const & arg_priority = TaskPriority::Regular
|
||||
)
|
||||
{
|
||||
static_assert( Kokkos::is_future<T>::value ||
|
||||
Kokkos::is_scheduler<T>::value
|
||||
, "Kokkos TaskSingle argument must be Future or TaskScheduler" );
|
||||
|
||||
return
|
||||
Kokkos::Impl::TaskPolicyData
|
||||
< Kokkos::Impl::TaskBase<void,void,void>::TaskSingle
|
||||
, typename std::conditional< Kokkos::is_future< T >::value , T ,
|
||||
typename Kokkos::Future< typename T::execution_space > >::type
|
||||
>( arg , arg_priority );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
/**\brief A host control thread spawns a task with options
|
||||
*
|
||||
* 1) Team or Serial
|
||||
* 2) With scheduler or dependence
|
||||
* 3) High, Normal, or Low priority
|
||||
*/
|
||||
template< int TaskEnum
|
||||
, typename DepFutureType
|
||||
, typename FunctorType >
|
||||
Future< typename FunctorType::value_type
|
||||
, typename DepFutureType::execution_space >
|
||||
host_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
|
||||
, FunctorType && arg_functor
|
||||
)
|
||||
{
|
||||
using exec_space = typename DepFutureType::execution_space ;
|
||||
using scheduler = TaskScheduler< exec_space > ;
|
||||
|
||||
typedef Impl::TaskBase< exec_space
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> task_type ;
|
||||
|
||||
static_assert( TaskEnum == task_type::TaskTeam ||
|
||||
TaskEnum == task_type::TaskSingle
|
||||
, "Kokkos host_spawn requires TaskTeam or TaskSingle" );
|
||||
|
||||
// May be spawning a Cuda task, must use the specialization
|
||||
// to query on-device function pointer.
|
||||
typename task_type::function_type const ptr =
|
||||
Kokkos::Impl::TaskQueueSpecialization< exec_space >::
|
||||
template get_function_pointer< task_type >();
|
||||
|
||||
return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
|
||||
}
|
||||
|
||||
/**\brief A task spawns a task with options
|
||||
*
|
||||
* 1) Team or Serial
|
||||
* 2) With scheduler or dependence
|
||||
* 3) High, Normal, or Low priority
|
||||
*/
|
||||
template< int TaskEnum
|
||||
, typename DepFutureType
|
||||
, typename FunctorType >
|
||||
Future< typename FunctorType::value_type
|
||||
, typename DepFutureType::execution_space >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
task_spawn( Impl::TaskPolicyData<TaskEnum,DepFutureType> const & arg_policy
|
||||
, FunctorType && arg_functor
|
||||
)
|
||||
{
|
||||
using exec_space = typename DepFutureType::execution_space ;
|
||||
using scheduler = TaskScheduler< exec_space > ;
|
||||
|
||||
typedef Impl::TaskBase< exec_space
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> task_type ;
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) && \
|
||||
defined( KOKKOS_ENABLE_CUDA )
|
||||
|
||||
static_assert( ! std::is_same< Kokkos::Cuda , exec_space >::value
|
||||
, "Error calling Kokkos::task_spawn for Cuda space within Host code" );
|
||||
|
||||
#endif
|
||||
|
||||
static_assert( TaskEnum == task_type::TaskTeam ||
|
||||
TaskEnum == task_type::TaskSingle
|
||||
, "Kokkos host_spawn requires TaskTeam or TaskSingle" );
|
||||
|
||||
typename task_type::function_type const ptr = task_type::apply ;
|
||||
|
||||
return scheduler::spawn( arg_policy , ptr , std::move(arg_functor) );
|
||||
}
|
||||
|
||||
/**\brief A task respawns itself with options
|
||||
*
|
||||
* 1) With scheduler or dependence
|
||||
* 2) High, Normal, or Low priority
|
||||
*/
|
||||
template< typename FunctorType , typename T >
|
||||
void
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
respawn( FunctorType * arg_self
|
||||
, T const & arg
|
||||
, TaskPriority const & arg_priority = TaskPriority::Regular
|
||||
)
|
||||
{
|
||||
static_assert( Kokkos::is_future<T>::value ||
|
||||
Kokkos::is_scheduler<T>::value
|
||||
, "Kokkos respawn argument must be Future or TaskScheduler" );
|
||||
|
||||
TaskScheduler< typename T::execution_space >::
|
||||
respawn( arg_self , arg , arg_priority );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename A1 , typename A2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Future< typename Future< A1 , A2 >::execution_space >
|
||||
when_all( Future< A1 , A2 > const arg[]
|
||||
, int narg
|
||||
)
|
||||
{
|
||||
return TaskScheduler< typename Future<A1,A2>::execution_space >::
|
||||
when_all( arg , narg );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Wait for all runnable tasks to complete
|
||||
|
||||
template< typename ExecSpace >
|
||||
inline
|
||||
void wait( TaskScheduler< ExecSpace > const & policy )
|
||||
{ policy.m_queue->execute(); }
|
||||
void wait( TaskScheduler< ExecSpace > const & scheduler )
|
||||
{ scheduler.m_queue->execute(); }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
@ -230,4 +230,3 @@ struct VerifyExecutionCanAccessMemorySpace
|
||||
#endif /* #if defined( KOKKOS_ENABLE_PTHREAD ) */
|
||||
#endif /* #define KOKKOS_THREADS_HPP */
|
||||
|
||||
|
||||
|
||||
@ -40,9 +40,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1)
|
||||
CONDITIONAL_COPIES += copy-threads
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
|
||||
KOKKOS_HEADERS_QTHREAD += $(wildcard $(KOKKOS_PATH)/core/src/Qthread/*.hpp)
|
||||
CONDITIONAL_COPIES += copy-qthread
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
|
||||
KOKKOS_HEADERS_QTHREADS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp)
|
||||
CONDITIONAL_COPIES += copy-qthreads
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
|
||||
@ -60,6 +60,12 @@ ifeq ($(KOKKOS_OS),Darwin)
|
||||
COPY_FLAG =
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_DEBUG),"no")
|
||||
KOKKOS_DEBUG_CMAKE = OFF
|
||||
else
|
||||
KOKKOS_DEBUG_CMAKE = ON
|
||||
endif
|
||||
|
||||
messages:
|
||||
echo "Start Build"
|
||||
|
||||
@ -91,6 +97,7 @@ build-makefile-kokkos:
|
||||
echo "" >> Makefile.kokkos
|
||||
echo "#Internal settings which need to propagated for Kokkos examples" >> Makefile.kokkos
|
||||
echo "KOKKOS_INTERNAL_USE_CUDA = ${KOKKOS_INTERNAL_USE_CUDA}" >> Makefile.kokkos
|
||||
echo "KOKKOS_INTERNAL_USE_QTHREADS = ${KOKKOS_INTERNAL_USE_QTHREADS}" >> Makefile.kokkos
|
||||
echo "KOKKOS_INTERNAL_USE_OPENMP = ${KOKKOS_INTERNAL_USE_OPENMP}" >> Makefile.kokkos
|
||||
echo "KOKKOS_INTERNAL_USE_PTHREADS = ${KOKKOS_INTERNAL_USE_PTHREADS}" >> Makefile.kokkos
|
||||
echo "" >> Makefile.kokkos
|
||||
@ -107,7 +114,55 @@ build-makefile-kokkos:
|
||||
> Makefile.kokkos.tmp
|
||||
mv -f Makefile.kokkos.tmp Makefile.kokkos
|
||||
|
||||
build-lib: build-makefile-kokkos $(KOKKOS_LINK_DEPENDS)
|
||||
build-cmake-kokkos:
|
||||
rm -f kokkos.cmake
|
||||
echo "#Global Settings used to generate this library" >> kokkos.cmake
|
||||
echo "set(KOKKOS_PATH $(PREFIX) CACHE PATH \"Kokkos installation path\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_DEVICES $(KOKKOS_DEVICES) CACHE STRING \"Kokkos devices list\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_ARCH $(KOKKOS_ARCH) CACHE STRING \"Kokkos architecture flags\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_DEBUG $(KOKKOS_DEBUG_CMAKE) CACHE BOOL \"Kokkos debug enabled ?)\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_USE_TPLS $(KOKKOS_USE_TPLS) CACHE STRING \"Kokkos templates list\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CXX_STANDARD $(KOKKOS_CXX_STANDARD) CACHE STRING \"Kokkos C++ standard\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_OPTIONS $(KOKKOS_OPTIONS) CACHE STRING \"Kokkos options\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CUDA_OPTIONS $(KOKKOS_CUDA_OPTIONS) CACHE STRING \"Kokkos Cuda options\")" >> kokkos.cmake
|
||||
echo "if(NOT $ENV{CXX})" >> kokkos.cmake
|
||||
echo ' message(WARNING "You are currently using compiler $${CMAKE_CXX_COMPILER} while Kokkos was built with $(CXX) ; make sure this is the behavior you intended to be.")' >> kokkos.cmake
|
||||
echo "endif()" >> kokkos.cmake
|
||||
echo "if(NOT DEFINED ENV{NVCC_WRAPPER})" >> kokkos.cmake
|
||||
echo " set(NVCC_WRAPPER \"$(NVCC_WRAPPER)\" CACHE FILEPATH \"Path to command nvcc_wrapper\")" >> kokkos.cmake
|
||||
echo "else()" >> kokkos.cmake
|
||||
echo ' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake
|
||||
echo "endif()" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake
|
||||
echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "#Variables used in application Makefiles" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_CPPFLAGS \"$(KOKKOS_CPPFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_LINK_DEPENDS \"$(KOKKOS_LINK_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_LIBS \"$(KOKKOS_LIBS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_LDFLAGS \"$(KOKKOS_LDFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
echo "#Internal settings which need to propagated for Kokkos examples" >> kokkos.cmake
|
||||
echo "set(KOKKOS_INTERNAL_USE_CUDA \"${KOKKOS_INTERNAL_USE_CUDA}\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_INTERNAL_USE_OPENMP \"${KOKKOS_INTERNAL_USE_OPENMP}\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "set(KOKKOS_INTERNAL_USE_PTHREADS \"${KOKKOS_INTERNAL_USE_PTHREADS}\" CACHE STRING \"\")" >> kokkos.cmake
|
||||
echo "mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS)" >> kokkos.cmake
|
||||
echo "" >> kokkos.cmake
|
||||
sed \
|
||||
-e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \
|
||||
-e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \
|
||||
-e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \
|
||||
-e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \
|
||||
-e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \
|
||||
-e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' kokkos.cmake \
|
||||
> kokkos.cmake.tmp
|
||||
mv -f kokkos.cmake.tmp kokkos.cmake
|
||||
|
||||
build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS)
|
||||
|
||||
mkdir:
|
||||
mkdir -p $(PREFIX)
|
||||
@ -124,9 +179,9 @@ copy-threads: mkdir
|
||||
mkdir -p $(PREFIX)/include/Threads
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_THREADS) $(PREFIX)/include/Threads
|
||||
|
||||
copy-qthread: mkdir
|
||||
mkdir -p $(PREFIX)/include/Qthread
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREAD) $(PREFIX)/include/Qthread
|
||||
copy-qthreads: mkdir
|
||||
mkdir -p $(PREFIX)/include/Qthreads
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_QTHREADS) $(PREFIX)/include/Qthreads
|
||||
|
||||
copy-openmp: mkdir
|
||||
mkdir -p $(PREFIX)/include/OpenMP
|
||||
@ -137,6 +192,7 @@ install: mkdir $(CONDITIONAL_COPIES) build-lib
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include
|
||||
cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl
|
||||
cp $(COPY_FLAG) Makefile.kokkos $(PREFIX)
|
||||
cp $(COPY_FLAG) kokkos.cmake $(PREFIX)
|
||||
cp $(COPY_FLAG) libkokkos.a $(PREFIX)/lib
|
||||
cp $(COPY_FLAG) KokkosCore_config.h $(PREFIX)/include
|
||||
|
||||
|
||||
@ -46,7 +46,6 @@
|
||||
|
||||
#include <omp.h>
|
||||
#include <iostream>
|
||||
#include <Kokkos_Parallel.hpp>
|
||||
#include <OpenMP/Kokkos_OpenMPexec.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
@ -107,58 +106,41 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const {
|
||||
this->template execute_schedule<typename Policy::schedule_type::type>();
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
inline
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
|
||||
execute_schedule() const
|
||||
inline void execute() const
|
||||
{
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
|
||||
ParallelFor::template exec_range< WorkTag >( m_functor , range.begin() , range.end() );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
if ( is_dynamic ) {
|
||||
// Make sure work partition is set before stealing
|
||||
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
inline
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
|
||||
execute_schedule() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
std::pair<int64_t,int64_t> range(0,0);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
do {
|
||||
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
range = is_dynamic ? data.get_work_stealing_chunk()
|
||||
: data.get_work_partition();
|
||||
|
||||
exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
|
||||
exec.reset_steal_target();
|
||||
#pragma omp barrier
|
||||
ParallelFor::template
|
||||
exec_range< WorkTag >( m_functor
|
||||
, range.first + m_policy.begin()
|
||||
, range.second + m_policy.begin() );
|
||||
|
||||
long work_index = exec.get_work_index();
|
||||
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
|
||||
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
|
||||
ParallelFor::template exec_range< WorkTag >( m_functor , begin, end );
|
||||
work_index = exec.get_work_index();
|
||||
} while ( is_dynamic && 0 <= range.first );
|
||||
}
|
||||
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
// END #pragma omp parallel
|
||||
}
|
||||
|
||||
inline
|
||||
@ -193,17 +175,18 @@ private:
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
// Static Assert WorkTag void if ReducerType not InvalidType
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
@ -247,92 +230,70 @@ private:
|
||||
|
||||
public:
|
||||
|
||||
inline void execute() const {
|
||||
this->template execute_schedule<typename Policy::schedule_type::type>();
|
||||
}
|
||||
|
||||
template<class Schedule>
|
||||
inline
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Static>::value >::type
|
||||
execute_schedule() const
|
||||
inline void execute() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
enum { is_dynamic = std::is_same< typename Policy::schedule_type::type
|
||||
, Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const size_t pool_reduce_bytes =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
|
||||
// Reduction:
|
||||
data.set_work_partition( m_policy.end() - m_policy.begin()
|
||||
, m_policy.chunk_size() );
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
if ( is_dynamic ) {
|
||||
// Make sure work partition is set before stealing
|
||||
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
reference_type update =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
|
||||
, data.pool_reduce_local() );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
std::pair<int64_t,int64_t> range(0,0);
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
}
|
||||
do {
|
||||
|
||||
template<class Schedule>
|
||||
inline
|
||||
typename std::enable_if< std::is_same<Schedule,Kokkos::Dynamic>::value >::type
|
||||
execute_schedule() const
|
||||
{
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
range = is_dynamic ? data.get_work_stealing_chunk()
|
||||
: data.get_work_partition();
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
|
||||
exec.set_work_range(range.begin(),range.end(),m_policy.chunk_size());
|
||||
exec.reset_steal_target();
|
||||
#pragma omp barrier
|
||||
|
||||
long work_index = exec.get_work_index();
|
||||
|
||||
reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
|
||||
while(work_index != -1) {
|
||||
const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
|
||||
const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
|
||||
ParallelReduce::template exec_range< WorkTag >
|
||||
( m_functor , begin,end
|
||||
ParallelReduce::template
|
||||
exec_range< WorkTag >( m_functor
|
||||
, range.first + m_policy.begin()
|
||||
, range.second + m_policy.begin()
|
||||
, update );
|
||||
work_index = exec.get_work_index();
|
||||
|
||||
} while ( is_dynamic && 0 <= range.first );
|
||||
}
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
// END #pragma omp parallel
|
||||
|
||||
// Reduction:
|
||||
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
@ -394,17 +355,18 @@ private:
|
||||
|
||||
typedef Kokkos::RangePolicy< Traits ... > Policy ;
|
||||
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::SCAN , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::WorkRange WorkRange ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, WorkTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType, WorkTag > ValueOps ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
@ -452,53 +414,63 @@ public:
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_scan");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_scan");
|
||||
|
||||
OpenMPexec::resize_scratch( 2 * ValueTraits::value_size( m_functor ) , 0 );
|
||||
const int value_count = Analysis::value_count( m_functor );
|
||||
const size_t pool_reduce_bytes = 2 * Analysis::value_size( m_functor );
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
, 0 // team_reduce_bytes
|
||||
, 0 // team_shared_bytes
|
||||
, 0 // thread_local_bytes
|
||||
);
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr =
|
||||
pointer_type( exec.scratch_reduce() ) +
|
||||
ValueTraits::value_count( m_functor );
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueInit::init( m_functor , ptr ) , false );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
|
||||
{
|
||||
const unsigned thread_count = OpenMPexec::pool_size();
|
||||
const unsigned value_count = ValueTraits::value_count( m_functor );
|
||||
const WorkRange range( m_policy, data.pool_rank(), data.pool_size() );
|
||||
|
||||
reference_type update_sum =
|
||||
ValueInit::init( m_functor , data.pool_reduce_local() );
|
||||
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end() , update_sum , false );
|
||||
|
||||
if ( data.pool_rendezvous() ) {
|
||||
|
||||
pointer_type ptr_prev = 0 ;
|
||||
|
||||
for ( unsigned rank_rev = thread_count ; rank_rev-- ; ) {
|
||||
const int n = data.pool_size();
|
||||
|
||||
pointer_type ptr = pointer_type( OpenMPexec::pool_rev(rank_rev)->scratch_reduce() );
|
||||
for ( int i = 0 ; i < n ; ++i ) {
|
||||
|
||||
if ( ptr_prev ) {
|
||||
for ( unsigned i = 0 ; i < value_count ; ++i ) { ptr[i] = ptr_prev[ i + value_count ] ; }
|
||||
ValueJoin::join( m_functor , ptr + value_count , ptr );
|
||||
pointer_type ptr = (pointer_type)
|
||||
data.pool_member(i)->pool_reduce_local();
|
||||
|
||||
if ( i ) {
|
||||
for ( int j = 0 ; j < value_count ; ++j ) {
|
||||
ptr[j+value_count] = ptr_prev[j+value_count] ;
|
||||
}
|
||||
ValueJoin::join( m_functor , ptr + value_count , ptr_prev );
|
||||
}
|
||||
else {
|
||||
ValueInit::init( m_functor , ptr );
|
||||
ValueInit::init( m_functor , ptr + value_count );
|
||||
}
|
||||
|
||||
ptr_prev = ptr ;
|
||||
}
|
||||
|
||||
data.pool_rendezvous_release();
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
|
||||
const pointer_type ptr = pointer_type( exec.scratch_reduce() );
|
||||
reference_type update_base =
|
||||
ValueOps::reference
|
||||
( ((pointer_type)data.pool_reduce_local()) + value_count );
|
||||
|
||||
ParallelScan::template exec_range< WorkTag >
|
||||
( m_functor , range.begin() , range.end()
|
||||
, ValueOps::reference( ptr ) , true );
|
||||
( m_functor , range.begin() , range.end() , update_base , true );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
@ -530,55 +502,59 @@ class ParallelFor< FunctorType
|
||||
{
|
||||
private:
|
||||
|
||||
enum { TEAM_REDUCE_SIZE = 512 };
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::schedule_type::type SchedTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
const int m_shmem_size ;
|
||||
|
||||
template< class TagType, class Schedule >
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Static>::value>::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
|
||||
exec_team( const FunctorType & functor
|
||||
, HostThreadTeamData & data
|
||||
, const int league_rank_begin
|
||||
, const int league_rank_end
|
||||
, const int league_size )
|
||||
{
|
||||
for ( ; member.valid_static() ; member.next_static() ) {
|
||||
functor( member );
|
||||
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
|
||||
|
||||
functor( Member( data, r , league_size ) );
|
||||
|
||||
if ( ++r < league_rank_end ) {
|
||||
// Don't allow team members to lap one another
|
||||
// so that they don't overwrite shared memory.
|
||||
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType, class Schedule >
|
||||
inline static
|
||||
typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Static>::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
const TagType t{} ;
|
||||
for ( ; member.valid_static() ; member.next_static() ) {
|
||||
functor( t , member );
|
||||
}
|
||||
}
|
||||
|
||||
template< class TagType, class Schedule >
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value && std::is_same<Schedule,Kokkos::Dynamic>::value>::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
|
||||
exec_team( const FunctorType & functor
|
||||
, HostThreadTeamData & data
|
||||
, const int league_rank_begin
|
||||
, const int league_rank_end
|
||||
, const int league_size )
|
||||
{
|
||||
#pragma omp barrier
|
||||
for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
|
||||
functor( member );
|
||||
}
|
||||
}
|
||||
const TagType t{};
|
||||
|
||||
template< class TagType, class Schedule >
|
||||
inline static
|
||||
typename std::enable_if< (! std::is_same< TagType , void >::value) && std::is_same<Schedule,Kokkos::Dynamic>::value >::type
|
||||
exec_team( const FunctorType & functor , Member member )
|
||||
{
|
||||
#pragma omp barrier
|
||||
const TagType t{} ;
|
||||
for ( ; member.valid_dynamic() ; member.next_dynamic() ) {
|
||||
functor( t , member );
|
||||
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
|
||||
|
||||
functor( t , Member( data, r , league_size ) );
|
||||
|
||||
if ( ++r < league_rank_end ) {
|
||||
// Don't allow team members to lap one another
|
||||
// so that they don't overwrite shared memory.
|
||||
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -587,31 +563,75 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_for");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_for");
|
||||
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
const size_t pool_reduce_size = 0 ; // Never shrinks
|
||||
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
|
||||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
|
||||
OpenMPexec::resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
|
||||
( m_functor
|
||||
, Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
if ( active ) {
|
||||
data.set_work_partition( m_policy.league_size()
|
||||
, ( 0 < m_policy.chunk_size()
|
||||
? m_policy.chunk_size()
|
||||
: m_policy.team_iter() ) );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
if ( is_dynamic ) {
|
||||
// Must synchronize to make sure each team has set its
|
||||
// partition before begining the work stealing loop.
|
||||
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
|
||||
}
|
||||
|
||||
if ( active ) {
|
||||
|
||||
std::pair<int64_t,int64_t> range(0,0);
|
||||
|
||||
do {
|
||||
|
||||
range = is_dynamic ? data.get_work_stealing_chunk()
|
||||
: data.get_work_partition();
|
||||
|
||||
ParallelFor::template exec_team< WorkTag >
|
||||
( m_functor , data
|
||||
, range.first , range.second , m_policy.league_size() );
|
||||
|
||||
} while ( is_dynamic && 0 <= range.first );
|
||||
}
|
||||
|
||||
data.disband_team();
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
}
|
||||
|
||||
|
||||
inline
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
const Policy & arg_policy )
|
||||
: m_functor( arg_functor )
|
||||
, m_policy( arg_policy )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >
|
||||
::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< class FunctorType , class ReducerType, class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
@ -622,20 +642,26 @@ class ParallelReduce< FunctorType
|
||||
{
|
||||
private:
|
||||
|
||||
enum { TEAM_REDUCE_SIZE = 512 };
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::OpenMP, Properties ... > Policy ;
|
||||
|
||||
typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::schedule_type::type SchedTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value
|
||||
, FunctorType, ReducerType> ReducerConditional;
|
||||
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
|
||||
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ;
|
||||
|
||||
typedef typename ValueTraits::pointer_type pointer_type ;
|
||||
typedef typename ValueTraits::reference_type reference_type ;
|
||||
typedef typename Analysis::pointer_type pointer_type ;
|
||||
typedef typename Analysis::reference_type reference_type ;
|
||||
|
||||
const FunctorType m_functor ;
|
||||
const Policy m_policy ;
|
||||
@ -645,22 +671,48 @@ private:
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
typename std::enable_if< ( std::is_same< TagType , void >::value ) >::type
|
||||
exec_team( const FunctorType & functor
|
||||
, HostThreadTeamData & data
|
||||
, reference_type & update
|
||||
, const int league_rank_begin
|
||||
, const int league_rank_end
|
||||
, const int league_size )
|
||||
{
|
||||
for ( ; member.valid_static() ; member.next_static() ) {
|
||||
functor( member , update );
|
||||
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
|
||||
|
||||
functor( Member( data, r , league_size ) , update );
|
||||
|
||||
if ( ++r < league_rank_end ) {
|
||||
// Don't allow team members to lap one another
|
||||
// so that they don't overwrite shared memory.
|
||||
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
template< class TagType >
|
||||
inline static
|
||||
typename std::enable_if< ! std::is_same< TagType , void >::value >::type
|
||||
exec_team( const FunctorType & functor , Member member , reference_type update )
|
||||
typename std::enable_if< ( ! std::is_same< TagType , void >::value ) >::type
|
||||
exec_team( const FunctorType & functor
|
||||
, HostThreadTeamData & data
|
||||
, reference_type & update
|
||||
, const int league_rank_begin
|
||||
, const int league_rank_end
|
||||
, const int league_size )
|
||||
{
|
||||
const TagType t{} ;
|
||||
for ( ; member.valid_static() ; member.next_static() ) {
|
||||
functor( t , member , update );
|
||||
const TagType t{};
|
||||
|
||||
for ( int r = league_rank_begin ; r < league_rank_end ; ) {
|
||||
|
||||
functor( t , Member( data, r , league_size ) , update );
|
||||
|
||||
if ( ++r < league_rank_end ) {
|
||||
// Don't allow team members to lap one another
|
||||
// so that they don't overwrite shared memory.
|
||||
if ( data.team_rendezvous() ) { data.team_rendezvous_release(); }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -669,43 +721,88 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
enum { is_dynamic = std::is_same< SchedTag , Kokkos::Dynamic >::value };
|
||||
|
||||
OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
|
||||
OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
|
||||
|
||||
const size_t team_reduce_size = Policy::member_type::team_reduce_size();
|
||||
const size_t pool_reduce_size =
|
||||
Analysis::value_size( ReducerConditional::select(m_functor, m_reducer));
|
||||
|
||||
OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
|
||||
const size_t team_reduce_size = TEAM_REDUCE_SIZE * m_policy.team_size();
|
||||
const size_t team_shared_size = m_shmem_size + m_policy.scratch_size(1);
|
||||
const size_t thread_local_size = 0 ; // Never shrinks
|
||||
|
||||
OpenMPexec::resize_thread_data( pool_reduce_size
|
||||
, team_reduce_size
|
||||
, team_shared_size
|
||||
, thread_local_size );
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
OpenMPexec & exec = * OpenMPexec::get_thread_omp();
|
||||
HostThreadTeamData & data = *OpenMPexec::get_thread_data();
|
||||
|
||||
const int active = data.organize_team( m_policy.team_size() );
|
||||
|
||||
if ( active ) {
|
||||
data.set_work_partition( m_policy.league_size()
|
||||
, ( 0 < m_policy.chunk_size()
|
||||
? m_policy.chunk_size()
|
||||
: m_policy.team_iter() ) );
|
||||
}
|
||||
|
||||
if ( is_dynamic ) {
|
||||
// Must synchronize to make sure each team has set its
|
||||
// partition before begining the work stealing loop.
|
||||
if ( data.pool_rendezvous() ) data.pool_rendezvous_release();
|
||||
}
|
||||
|
||||
if ( active ) {
|
||||
reference_type update =
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
|
||||
, data.pool_reduce_local() );
|
||||
|
||||
std::pair<int64_t,int64_t> range(0,0);
|
||||
|
||||
do {
|
||||
|
||||
range = is_dynamic ? data.get_work_stealing_chunk()
|
||||
: data.get_work_partition();
|
||||
|
||||
ParallelReduce::template exec_team< WorkTag >
|
||||
( m_functor
|
||||
, Member( exec , m_policy , m_shmem_size, 0 )
|
||||
, ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
|
||||
( m_functor , data , update
|
||||
, range.first , range.second , m_policy.league_size() );
|
||||
|
||||
} while ( is_dynamic && 0 <= range.first );
|
||||
} else {
|
||||
ValueInit::init( ReducerConditional::select(m_functor , m_reducer)
|
||||
, data.pool_reduce_local() );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
{
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
|
||||
data.disband_team();
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
int max_active_threads = OpenMPexec::pool_size();
|
||||
if( max_active_threads > m_policy.league_size()* m_policy.team_size() )
|
||||
max_active_threads = m_policy.league_size()* m_policy.team_size();
|
||||
// Reduction:
|
||||
|
||||
for ( int i = 1 ; i < max_active_threads ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
|
||||
const pointer_type ptr = pointer_type( OpenMPexec::get_thread_data(0)->pool_reduce_local() );
|
||||
|
||||
for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
|
||||
ValueJoin::join( ReducerConditional::select(m_functor , m_reducer)
|
||||
, ptr
|
||||
, OpenMPexec::get_thread_data(i)->pool_reduce_local() );
|
||||
}
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
|
||||
|
||||
if ( m_result_ptr ) {
|
||||
const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) );
|
||||
|
||||
for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class ViewType >
|
||||
inline
|
||||
@ -720,7 +817,10 @@ public:
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( InvalidType() )
|
||||
, m_result_ptr( arg_result.ptr_on_device() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >
|
||||
::value( arg_functor , arg_policy.team_size() ) )
|
||||
{}
|
||||
|
||||
inline
|
||||
@ -731,7 +831,10 @@ public:
|
||||
, m_policy( arg_policy )
|
||||
, m_reducer( reducer )
|
||||
, m_result_ptr( reducer.result_view().data() )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
|
||||
, m_shmem_size( arg_policy.scratch_size(0) +
|
||||
arg_policy.scratch_size(1) +
|
||||
FunctorTeamShmemSize< FunctorType >
|
||||
::value( arg_functor , arg_policy.team_size() ) )
|
||||
{
|
||||
/*static_assert( std::is_same< typename ViewType::memory_space
|
||||
, Kokkos::HostSpace >::value
|
||||
|
||||
@ -46,6 +46,7 @@
|
||||
#if defined( KOKKOS_ENABLE_OPENMP ) && defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
@ -55,105 +56,46 @@ namespace Impl {
|
||||
|
||||
template class TaskQueue< Kokkos::OpenMP > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
class HostThreadTeamDataSingleton : private HostThreadTeamData {
|
||||
private:
|
||||
|
||||
TaskExec< Kokkos::OpenMP >::
|
||||
TaskExec()
|
||||
: m_self_exec( 0 )
|
||||
, m_team_exec( 0 )
|
||||
, m_sync_mask( 0 )
|
||||
, m_sync_value( 0 )
|
||||
, m_sync_step( 0 )
|
||||
, m_group_rank( 0 )
|
||||
, m_team_rank( 0 )
|
||||
, m_team_size( 1 )
|
||||
{
|
||||
}
|
||||
HostThreadTeamDataSingleton() : HostThreadTeamData()
|
||||
{
|
||||
Kokkos::OpenMP::memory_space space ;
|
||||
const size_t num_pool_reduce_bytes = 32 ;
|
||||
const size_t num_team_reduce_bytes = 32 ;
|
||||
const size_t num_team_shared_bytes = 1024 ;
|
||||
const size_t num_thread_local_bytes = 1024 ;
|
||||
const size_t alloc_bytes =
|
||||
HostThreadTeamData::scratch_size( num_pool_reduce_bytes
|
||||
, num_team_reduce_bytes
|
||||
, num_team_shared_bytes
|
||||
, num_thread_local_bytes );
|
||||
|
||||
TaskExec< Kokkos::OpenMP >::
|
||||
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
|
||||
: m_self_exec( & arg_exec )
|
||||
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
|
||||
, m_sync_mask( 0 )
|
||||
, m_sync_value( 0 )
|
||||
, m_sync_step( 0 )
|
||||
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
|
||||
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
|
||||
, m_team_size( arg_team_size )
|
||||
{
|
||||
// This team spans
|
||||
// m_self_exec->pool_rev( team_size * group_rank )
|
||||
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
|
||||
|
||||
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
|
||||
|
||||
sync[0] = int64_t(0) ;
|
||||
sync[1] = int64_t(0) ;
|
||||
|
||||
for ( int i = 0 ; i < m_team_size ; ++i ) {
|
||||
m_sync_value |= int64_t(1) << (8*i);
|
||||
m_sync_mask |= int64_t(3) << (8*i);
|
||||
HostThreadTeamData::scratch_assign
|
||||
( space.allocate( alloc_bytes )
|
||||
, alloc_bytes
|
||||
, num_pool_reduce_bytes
|
||||
, num_team_reduce_bytes
|
||||
, num_team_shared_bytes
|
||||
, num_thread_local_bytes );
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
|
||||
{
|
||||
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
|
||||
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
|
||||
~HostThreadTeamDataSingleton()
|
||||
{
|
||||
Kokkos::OpenMP::memory_space space ;
|
||||
space.deallocate( HostThreadTeamData::scratch_buffer()
|
||||
, HostThreadTeamData::scratch_bytes() );
|
||||
}
|
||||
|
||||
// Use team shared memory to synchronize.
|
||||
// Alternate memory locations between barriers to avoid a sequence
|
||||
// of barriers overtaking one another.
|
||||
public:
|
||||
|
||||
int64_t volatile * const sync =
|
||||
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
|
||||
|
||||
// This team member sets one byte within the sync variable
|
||||
int8_t volatile * const sync_self =
|
||||
((int8_t *) sync) + m_team_rank ;
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
|
||||
, m_group_rank
|
||||
, m_team_rank
|
||||
, m_sync_step
|
||||
, m_sync_value
|
||||
, *sync
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
|
||||
|
||||
while ( m_sync_value != *sync ); // wait for team to arrive
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
|
||||
, m_group_rank
|
||||
, m_team_rank
|
||||
, m_sync_step
|
||||
, m_sync_value
|
||||
, *sync
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
++m_sync_step ;
|
||||
|
||||
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
|
||||
m_sync_value ^= m_sync_mask ;
|
||||
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
|
||||
static HostThreadTeamData & singleton()
|
||||
{
|
||||
static HostThreadTeamDataSingleton s ;
|
||||
return s ;
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -163,123 +105,165 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::execute
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using PoolExec = Kokkos::Impl::OpenMPexec ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
using Member = Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
static task_root_type * const end =
|
||||
(task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
// Required: team_size <= 8
|
||||
HostThreadTeamData & team_data_single =
|
||||
HostThreadTeamDataSingleton::singleton();
|
||||
|
||||
const int team_size = PoolExec::pool_size(2); // Threads per core
|
||||
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
|
||||
const int team_size = Impl::OpenMPexec::pool_size(2); // Threads per core
|
||||
// const int team_size = Impl::OpenMPexec::pool_size(1); // Threads per NUMA
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> execute %d\n", team_size );
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
if ( 8 < team_size ) {
|
||||
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
PoolExec & self = *PoolExec::get_thread_omp();
|
||||
Impl::HostThreadTeamData & self = *Impl::OpenMPexec::get_thread_data();
|
||||
|
||||
Member single_exec ;
|
||||
Member team_exec( self , team_size );
|
||||
// Organizing threads into a team performs a barrier across the
|
||||
// entire pool to insure proper initialization of the team
|
||||
// rendezvous mechanism before a team rendezvous can be performed.
|
||||
|
||||
// Team shared memory
|
||||
task_root_type * volatile * const task_shared =
|
||||
(task_root_type **) team_exec.m_team_exec->scratch_thread();
|
||||
if ( self.organize_team( team_size ) ) {
|
||||
|
||||
// Barrier across entire OpenMP thread pool to insure initialization
|
||||
#pragma omp barrier
|
||||
Member single_exec( team_data_single );
|
||||
Member team_exec( self );
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) running\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, team_exec.team_rank()
|
||||
, team_exec.team_size()
|
||||
, team_exec.league_rank()
|
||||
, team_exec.league_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
do {
|
||||
|
||||
task_root_type * task = 0 ;
|
||||
|
||||
do {
|
||||
// Each team lead attempts to acquire either a thread team task
|
||||
// or a single thread task for the team.
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
|
||||
bool leader_loop = false ;
|
||||
|
||||
do {
|
||||
|
||||
if ( 0 != task && end != task ) {
|
||||
// team member #0 completes the previously executed task,
|
||||
// completion may delete the task
|
||||
queue->complete( task );
|
||||
}
|
||||
|
||||
// If 0 == m_ready_count then set task = 0
|
||||
|
||||
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
|
||||
// Attempt to acquire a task
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
|
||||
task = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
// Team lead broadcast acquired task to team members:
|
||||
// If still tasks are still executing
|
||||
// and no task could be acquired
|
||||
// then continue this leader loop
|
||||
leader_loop = end == task ;
|
||||
|
||||
if ( 1 < team_exec.team_size() ) {
|
||||
if ( ( ! leader_loop ) &&
|
||||
( 0 != task ) &&
|
||||
( task_root_type::TaskSingle == task->m_task_type ) ) {
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
|
||||
|
||||
// Fence to be sure task_shared is stored before the barrier
|
||||
Kokkos::memory_fence();
|
||||
|
||||
// Whole team waits for every team member to reach this statement
|
||||
team_exec.team_barrier();
|
||||
|
||||
// Fence to be sure task_shared is stored
|
||||
Kokkos::memory_fence();
|
||||
|
||||
task = *task_shared ;
|
||||
}
|
||||
// if a single thread task then execute now
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
|
||||
, team_exec.m_group_rank
|
||||
, team_exec.m_team_rank
|
||||
, uintptr_t(task_shared)
|
||||
, uintptr_t(task)
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) executing single task 0x%lx\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, int64_t(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
if ( 0 == task ) break ; // 0 == m_ready_count
|
||||
(*task->m_apply)( task , & single_exec );
|
||||
|
||||
if ( end == task ) {
|
||||
// All team members wait for whole team to reach this statement.
|
||||
// Is necessary to prevent task_shared from being updated
|
||||
// before it is read by all threads.
|
||||
team_exec.team_barrier();
|
||||
leader_loop = true ;
|
||||
}
|
||||
else if ( task_root_type::TaskTeam == task->m_task_type ) {
|
||||
// Thread Team Task
|
||||
} while ( leader_loop );
|
||||
}
|
||||
|
||||
// Team lead either found 0 == m_ready_count or a team task
|
||||
// Team lead broadcast acquired task:
|
||||
|
||||
team_exec.team_broadcast( task , 0);
|
||||
|
||||
if ( 0 != task ) { // Thread Team Task
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team((%d of %d) league(%d of %d) executing team task 0x%lx\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, team_exec.team_rank()
|
||||
, team_exec.team_size()
|
||||
, team_exec.league_rank()
|
||||
, team_exec.league_size()
|
||||
, int64_t(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
(*task->m_apply)( task , & team_exec );
|
||||
|
||||
// The m_apply function performs a barrier
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
// team member #0 completes the task, which may delete the task
|
||||
queue->complete( task );
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Single Thread Task
|
||||
} while( 0 != task );
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) team(%d of %d) league(%d of %d) ending\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
, team_exec.team_rank()
|
||||
, team_exec.team_size()
|
||||
, team_exec.league_rank()
|
||||
, team_exec.league_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
(*task->m_apply)( task , & single_exec );
|
||||
|
||||
queue->complete( task );
|
||||
}
|
||||
|
||||
// All team members wait for whole team to reach this statement.
|
||||
// Not necessary to complete the task.
|
||||
// Is necessary to prevent task_shared from being updated
|
||||
// before it is read by all threads.
|
||||
team_exec.team_barrier();
|
||||
}
|
||||
} while(1);
|
||||
self.disband_team();
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> pool(%d of %d) disbanded\n"
|
||||
, self.pool_rank()
|
||||
, self.pool_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"TaskQueue<OpenMP> execute %d end\n", team_size );
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
@ -289,13 +273,16 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
using Member = Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
if ( 1 == omp_get_num_threads() ) {
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member single_exec ;
|
||||
HostThreadTeamData & team_data_single =
|
||||
HostThreadTeamDataSingleton::singleton();
|
||||
|
||||
Member single_exec( team_data_single );
|
||||
|
||||
task_root_type * task = end ;
|
||||
|
||||
@ -306,7 +293,7 @@ void TaskQueueSpecialization< Kokkos::OpenMP >::
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
|
||||
task = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -60,6 +60,7 @@ public:
|
||||
using execution_space = Kokkos::OpenMP ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
// Must specify memory space
|
||||
using memory_space = Kokkos::HostSpace ;
|
||||
@ -70,296 +71,19 @@ public:
|
||||
// Must provide task queue execution function
|
||||
static void execute( queue_type * const );
|
||||
|
||||
// Must provide mechanism to set function pointer in
|
||||
// execution space from the host process.
|
||||
template< typename FunctorType >
|
||||
template< typename TaskType >
|
||||
static
|
||||
void proc_set_apply( task_base_type::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< Kokkos::OpenMP
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> ;
|
||||
*ptr = TaskType::apply ;
|
||||
}
|
||||
typename TaskType::function_type
|
||||
get_function_pointer() { return TaskType::apply ; }
|
||||
};
|
||||
|
||||
extern template class TaskQueue< Kokkos::OpenMP > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<>
|
||||
class TaskExec< Kokkos::OpenMP >
|
||||
{
|
||||
private:
|
||||
|
||||
TaskExec( TaskExec && ) = delete ;
|
||||
TaskExec( TaskExec const & ) = delete ;
|
||||
TaskExec & operator = ( TaskExec && ) = delete ;
|
||||
TaskExec & operator = ( TaskExec const & ) = delete ;
|
||||
|
||||
|
||||
using PoolExec = Kokkos::Impl::OpenMPexec ;
|
||||
|
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
|
||||
|
||||
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
|
||||
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
|
||||
int64_t m_sync_mask ;
|
||||
int64_t mutable m_sync_value ;
|
||||
int mutable m_sync_step ;
|
||||
int m_group_rank ; ///< Which "team" subset of thread pool
|
||||
int m_team_rank ; ///< Which thread within a team
|
||||
int m_team_size ;
|
||||
|
||||
TaskExec();
|
||||
TaskExec( PoolExec & arg_exec , int arg_team_size );
|
||||
|
||||
void team_barrier_impl() const ;
|
||||
|
||||
public:
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
void * team_shared() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
|
||||
|
||||
int team_shared_size() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
|
||||
|
||||
/**\brief Whole team enters this function call
|
||||
* before any teeam member returns from
|
||||
* this function call.
|
||||
*/
|
||||
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
|
||||
#else
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
|
||||
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_rank() const { return m_team_rank ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_size() const { return m_team_size ; }
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
|
||||
TeamThreadRange
|
||||
( Impl::TaskExec< Kokkos::OpenMP > & thread, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
|
||||
}
|
||||
|
||||
template<typename iType1, typename iType2>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::TaskExec< Kokkos::OpenMP > >
|
||||
TeamThreadRange
|
||||
( Impl:: TaskExec< Kokkos::OpenMP > & thread, const iType1 & begin, const iType2 & end )
|
||||
{
|
||||
typedef typename std::common_type<iType1, iType2>::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct<iType, Impl::TaskExec< Kokkos::OpenMP > >(thread, begin, end);
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
|
||||
ThreadVectorRange
|
||||
( Impl::TaskExec< Kokkos::OpenMP > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.
|
||||
*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
)
|
||||
{
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i);
|
||||
}
|
||||
}
|
||||
|
||||
template<typename iType, class Lambda, typename ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::OpenMP > >& loop_boundaries
|
||||
, const Lambda& lambda
|
||||
, ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
shared[0] += shared[i];
|
||||
}
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i, result);
|
||||
}
|
||||
|
||||
if ( 1 < loop_boundaries.thread.team_size() ) {
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
shared[team_rank] = result;
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// reduce across threads to thread 0
|
||||
if (team_rank == 0) {
|
||||
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
|
||||
join(shared[0], shared[i]);
|
||||
}
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// broadcast result
|
||||
initialized_result = shared[0];
|
||||
}
|
||||
else {
|
||||
initialized_result = result ;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
}
|
||||
|
||||
template< typename ValueType, typename iType, class Lambda >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, local_total;
|
||||
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
|
||||
int team_size = loop_boundaries.thread.team_size();
|
||||
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
|
||||
|
||||
// Intra-member scan
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
|
||||
shared[team_rank] = accum;
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// Member 0 do scan on accumulated totals
|
||||
if (team_rank == 0) {
|
||||
for( iType i = 1; i < team_size; i+=1) {
|
||||
shared[i] += shared[i-1];
|
||||
}
|
||||
accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan
|
||||
}
|
||||
|
||||
loop_boundaries.thread.team_barrier();
|
||||
|
||||
// Inter-member scan adding in accumulated totals
|
||||
if (team_rank != 0) { accum = shared[team_rank-1]; }
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */
|
||||
|
||||
|
||||
@ -86,7 +86,7 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
|
||||
|
||||
OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
HostThreadTeamData * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
|
||||
|
||||
void OpenMPexec::verify_is_process( const char * const label )
|
||||
{
|
||||
@ -113,67 +113,110 @@ void OpenMPexec::verify_initialized( const char * const label )
|
||||
|
||||
}
|
||||
|
||||
void OpenMPexec::clear_scratch()
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
void OpenMPexec::clear_thread_data()
|
||||
{
|
||||
const size_t member_bytes =
|
||||
sizeof(int64_t) *
|
||||
HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
|
||||
|
||||
const int old_alloc_bytes =
|
||||
m_pool[0] ? ( member_bytes + m_pool[0]->scratch_bytes() ) : 0 ;
|
||||
|
||||
Kokkos::HostSpace space ;
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
if ( m_pool[ rank_rev ] ) {
|
||||
Record * const r = Record::get_record( m_pool[ rank_rev ] );
|
||||
m_pool[ rank_rev ] = 0 ;
|
||||
Record::decrement( r );
|
||||
const int rank = m_map_rank[ omp_get_thread_num() ];
|
||||
|
||||
if ( 0 != m_pool[rank] ) {
|
||||
|
||||
m_pool[rank]->disband_pool();
|
||||
|
||||
space.deallocate( m_pool[rank] , old_alloc_bytes );
|
||||
|
||||
m_pool[rank] = 0 ;
|
||||
}
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
}
|
||||
|
||||
void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
|
||||
void OpenMPexec::resize_thread_data( size_t pool_reduce_bytes
|
||||
, size_t team_reduce_bytes
|
||||
, size_t team_shared_bytes
|
||||
, size_t thread_local_bytes )
|
||||
{
|
||||
enum { ALIGN_MASK = Kokkos::Impl::MEMORY_ALIGNMENT - 1 };
|
||||
enum { ALLOC_EXEC = ( sizeof(OpenMPexec) + ALIGN_MASK ) & ~ALIGN_MASK };
|
||||
const size_t member_bytes =
|
||||
sizeof(int64_t) *
|
||||
HostThreadTeamData::align_to_int64( sizeof(HostThreadTeamData) );
|
||||
|
||||
const size_t old_reduce_size = m_pool[0] ? m_pool[0]->m_scratch_reduce_end : 0 ;
|
||||
const size_t old_thread_size = m_pool[0] ? m_pool[0]->m_scratch_thread_end - m_pool[0]->m_scratch_reduce_end : 0 ;
|
||||
HostThreadTeamData * root = m_pool[0] ;
|
||||
|
||||
reduce_size = ( reduce_size + ALIGN_MASK ) & ~ALIGN_MASK ;
|
||||
thread_size = ( thread_size + ALIGN_MASK ) & ~ALIGN_MASK ;
|
||||
const size_t old_pool_reduce = root ? root->pool_reduce_bytes() : 0 ;
|
||||
const size_t old_team_reduce = root ? root->team_reduce_bytes() : 0 ;
|
||||
const size_t old_team_shared = root ? root->team_shared_bytes() : 0 ;
|
||||
const size_t old_thread_local = root ? root->thread_local_bytes() : 0 ;
|
||||
const size_t old_alloc_bytes = root ? ( member_bytes + root->scratch_bytes() ) : 0 ;
|
||||
|
||||
// Requesting allocation and old allocation is too small:
|
||||
// Allocate if any of the old allocation is tool small:
|
||||
|
||||
const bool allocate = ( old_reduce_size < reduce_size ) ||
|
||||
( old_thread_size < thread_size );
|
||||
|
||||
if ( allocate ) {
|
||||
if ( reduce_size < old_reduce_size ) { reduce_size = old_reduce_size ; }
|
||||
if ( thread_size < old_thread_size ) { thread_size = old_thread_size ; }
|
||||
}
|
||||
|
||||
const size_t alloc_size = allocate ? ALLOC_EXEC + reduce_size + thread_size : 0 ;
|
||||
const int pool_size = m_pool_topo[0] ;
|
||||
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
|
||||
( old_team_reduce < team_reduce_bytes ) ||
|
||||
( old_team_shared < team_shared_bytes ) ||
|
||||
( old_thread_local < thread_local_bytes );
|
||||
|
||||
if ( allocate ) {
|
||||
|
||||
clear_scratch();
|
||||
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
|
||||
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
|
||||
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
|
||||
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
|
||||
|
||||
const size_t alloc_bytes =
|
||||
member_bytes +
|
||||
HostThreadTeamData::scratch_size( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes );
|
||||
|
||||
const int pool_size = omp_get_max_threads();
|
||||
|
||||
Kokkos::HostSpace space ;
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
const int rank_rev = m_map_rank[ omp_get_thread_num() ];
|
||||
const int rank = pool_size - ( rank_rev + 1 );
|
||||
const int rank = m_map_rank[ omp_get_thread_num() ];
|
||||
|
||||
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
|
||||
if ( 0 != m_pool[rank] ) {
|
||||
|
||||
Record * const r = Record::allocate( Kokkos::HostSpace()
|
||||
, "openmp_scratch"
|
||||
, alloc_size );
|
||||
m_pool[rank]->disband_pool();
|
||||
|
||||
Record::increment( r );
|
||||
space.deallocate( m_pool[rank] , old_alloc_bytes );
|
||||
}
|
||||
|
||||
m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
|
||||
void * const ptr = space.allocate( alloc_bytes );
|
||||
|
||||
new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
|
||||
m_pool[ rank ] = new( ptr ) HostThreadTeamData();
|
||||
|
||||
m_pool[ rank ]->
|
||||
scratch_assign( ((char *)ptr) + member_bytes
|
||||
, alloc_bytes
|
||||
, pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes );
|
||||
}
|
||||
/* END #pragma omp parallel */
|
||||
|
||||
HostThreadTeamData::organize_pool( m_pool , pool_size );
|
||||
}
|
||||
}
|
||||
|
||||
@ -197,14 +240,14 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
// Before any other call to OMP query the maximum number of threads
|
||||
// and save the value for re-initialization unit testing.
|
||||
|
||||
//Using omp_get_max_threads(); is problematic in conjunction with
|
||||
//Hwloc on Intel (essentially an initial call to the OpenMP runtime
|
||||
//without a parallel region before will set a process mask for a single core
|
||||
//The runtime will than bind threads for a parallel region to other cores on the
|
||||
//entering the first parallel region and make the process mask the aggregate of
|
||||
//the thread masks. The intend seems to be to make serial code run fast, if you
|
||||
//compile with OpenMP enabled but don't actually use parallel regions or so
|
||||
//static int omp_max_threads = omp_get_max_threads();
|
||||
// Using omp_get_max_threads(); is problematic in conjunction with
|
||||
// Hwloc on Intel (essentially an initial call to the OpenMP runtime
|
||||
// without a parallel region before will set a process mask for a single core
|
||||
// The runtime will than bind threads for a parallel region to other cores on the
|
||||
// entering the first parallel region and make the process mask the aggregate of
|
||||
// the thread masks. The intend seems to be to make serial code run fast, if you
|
||||
// compile with OpenMP enabled but don't actually use parallel regions or so
|
||||
// static int omp_max_threads = omp_get_max_threads();
|
||||
int nthreads = 0;
|
||||
#pragma omp parallel
|
||||
{
|
||||
@ -268,8 +311,6 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
// Call to 'bind_this_thread' is not thread safe so place this whole block in a critical region.
|
||||
// Call to 'new' may not be thread safe as well.
|
||||
|
||||
// Reverse the rank for threads so that the scan operation reduces to the highest rank thread.
|
||||
|
||||
const unsigned omp_rank = omp_get_thread_num();
|
||||
const unsigned thread_r = Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads()
|
||||
? Kokkos::hwloc::bind_this_thread( thread_count , threads_coord )
|
||||
@ -286,7 +327,19 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
Impl::OpenMPexec::m_pool_topo[1] = Impl::s_using_hwloc ? thread_count / use_numa_count : thread_count;
|
||||
Impl::OpenMPexec::m_pool_topo[2] = Impl::s_using_hwloc ? thread_count / ( use_numa_count * use_cores_per_numa ) : 1;
|
||||
|
||||
Impl::OpenMPexec::resize_scratch( 1024 , 1024 );
|
||||
// New, unified host thread team data:
|
||||
{
|
||||
size_t pool_reduce_bytes = 32 * thread_count ;
|
||||
size_t team_reduce_bytes = 32 * thread_count ;
|
||||
size_t team_shared_bytes = 1024 * thread_count ;
|
||||
size_t thread_local_bytes = 1024 ;
|
||||
|
||||
Impl::OpenMPexec::resize_thread_data( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@ -309,7 +362,7 @@ void OpenMP::initialize( unsigned thread_count ,
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
@ -321,7 +374,8 @@ void OpenMP::finalize()
|
||||
Impl::OpenMPexec::verify_initialized( "OpenMP::finalize" );
|
||||
Impl::OpenMPexec::verify_is_process( "OpenMP::finalize" );
|
||||
|
||||
Impl::OpenMPexec::clear_scratch();
|
||||
// New, unified host thread team data:
|
||||
Impl::OpenMPexec::clear_thread_data();
|
||||
|
||||
Impl::OpenMPexec::m_pool_topo[0] = 0 ;
|
||||
Impl::OpenMPexec::m_pool_topo[1] = 0 ;
|
||||
@ -333,7 +387,7 @@ void OpenMP::finalize()
|
||||
hwloc::unbind_this_thread();
|
||||
}
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -44,13 +44,22 @@
|
||||
#ifndef KOKKOS_OPENMPEXEC_HPP
|
||||
#define KOKKOS_OPENMPEXEC_HPP
|
||||
|
||||
#include <Kokkos_OpenMP.hpp>
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <fstream>
|
||||
|
||||
#include <omp.h>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
@ -60,41 +69,19 @@ namespace Impl {
|
||||
class OpenMPexec {
|
||||
public:
|
||||
|
||||
friend class Kokkos::OpenMP ;
|
||||
|
||||
enum { MAX_THREAD_COUNT = 4096 };
|
||||
|
||||
private:
|
||||
|
||||
static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
|
||||
|
||||
static int m_pool_topo[ 4 ];
|
||||
static int m_map_rank[ MAX_THREAD_COUNT ];
|
||||
|
||||
friend class Kokkos::OpenMP ;
|
||||
static HostThreadTeamData * m_pool[ MAX_THREAD_COUNT ];
|
||||
|
||||
int const m_pool_rank ;
|
||||
int const m_pool_rank_rev ;
|
||||
int const m_scratch_exec_end ;
|
||||
int const m_scratch_reduce_end ;
|
||||
int const m_scratch_thread_end ;
|
||||
|
||||
int volatile m_barrier_state ;
|
||||
|
||||
// Members for dynamic scheduling
|
||||
// Which thread am I stealing from currently
|
||||
int m_current_steal_target;
|
||||
// This thread's owned work_range
|
||||
Kokkos::pair<long,long> m_work_range KOKKOS_ALIGN(16);
|
||||
// Team Offset if one thread determines work_range for others
|
||||
long m_team_work_index;
|
||||
|
||||
// Is this thread stealing (i.e. its owned work_range is exhausted
|
||||
bool m_stealing;
|
||||
|
||||
OpenMPexec();
|
||||
OpenMPexec( const OpenMPexec & );
|
||||
OpenMPexec & operator = ( const OpenMPexec & );
|
||||
|
||||
static void clear_scratch();
|
||||
static
|
||||
void clear_thread_data();
|
||||
|
||||
public:
|
||||
|
||||
@ -108,44 +95,6 @@ public:
|
||||
inline static
|
||||
int pool_size( int depth = 0 ) { return m_pool_topo[ depth ]; }
|
||||
|
||||
inline static
|
||||
OpenMPexec * pool_rev( int pool_rank_rev ) { return m_pool[ pool_rank_rev ]; }
|
||||
|
||||
inline int pool_rank() const { return m_pool_rank ; }
|
||||
inline int pool_rank_rev() const { return m_pool_rank_rev ; }
|
||||
|
||||
inline long team_work_index() const { return m_team_work_index ; }
|
||||
|
||||
inline int scratch_reduce_size() const
|
||||
{ return m_scratch_reduce_end - m_scratch_exec_end ; }
|
||||
|
||||
inline int scratch_thread_size() const
|
||||
{ return m_scratch_thread_end - m_scratch_reduce_end ; }
|
||||
|
||||
inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
|
||||
inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
|
||||
|
||||
inline
|
||||
void state_wait( int state )
|
||||
{ Impl::spinwait( m_barrier_state , state ); }
|
||||
|
||||
inline
|
||||
void state_set( int state ) { m_barrier_state = state ; }
|
||||
|
||||
~OpenMPexec() {}
|
||||
|
||||
OpenMPexec( const int arg_poolRank
|
||||
, const int arg_scratch_exec_size
|
||||
, const int arg_scratch_reduce_size
|
||||
, const int arg_scratch_thread_size )
|
||||
: m_pool_rank( arg_poolRank )
|
||||
, m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
|
||||
, m_scratch_exec_end( arg_scratch_exec_size )
|
||||
, m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
|
||||
, m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
|
||||
, m_barrier_state(0)
|
||||
{}
|
||||
|
||||
static void finalize();
|
||||
|
||||
static void initialize( const unsigned team_count ,
|
||||
@ -156,133 +105,20 @@ public:
|
||||
static void verify_is_process( const char * const );
|
||||
static void verify_initialized( const char * const );
|
||||
|
||||
static void resize_scratch( size_t reduce_size , size_t thread_size );
|
||||
|
||||
static
|
||||
void resize_thread_data( size_t pool_reduce_bytes
|
||||
, size_t team_reduce_bytes
|
||||
, size_t team_shared_bytes
|
||||
, size_t thread_local_bytes );
|
||||
|
||||
inline static
|
||||
OpenMPexec * get_thread_omp() { return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
|
||||
|
||||
/* Dynamic Scheduling related functionality */
|
||||
// Initialize the work range for this thread
|
||||
inline void set_work_range(const long& begin, const long& end, const long& chunk_size) {
|
||||
m_work_range.first = (begin+chunk_size-1)/chunk_size;
|
||||
m_work_range.second = end>0?(end+chunk_size-1)/chunk_size:m_work_range.first;
|
||||
}
|
||||
|
||||
// Claim and index from this thread's range from the beginning
|
||||
inline long get_work_index_begin () {
|
||||
Kokkos::pair<long,long> work_range_new = m_work_range;
|
||||
Kokkos::pair<long,long> work_range_old = work_range_new;
|
||||
if(work_range_old.first>=work_range_old.second)
|
||||
return -1;
|
||||
|
||||
work_range_new.first+=1;
|
||||
|
||||
bool success = false;
|
||||
while(!success) {
|
||||
work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
|
||||
success = ( (work_range_new == work_range_old) ||
|
||||
(work_range_new.first>=work_range_new.second));
|
||||
work_range_old = work_range_new;
|
||||
work_range_new.first+=1;
|
||||
}
|
||||
if(work_range_old.first<work_range_old.second)
|
||||
return work_range_old.first;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Claim and index from this thread's range from the end
|
||||
inline long get_work_index_end () {
|
||||
Kokkos::pair<long,long> work_range_new = m_work_range;
|
||||
Kokkos::pair<long,long> work_range_old = work_range_new;
|
||||
if(work_range_old.first>=work_range_old.second)
|
||||
return -1;
|
||||
work_range_new.second-=1;
|
||||
bool success = false;
|
||||
while(!success) {
|
||||
work_range_new = Kokkos::atomic_compare_exchange(&m_work_range,work_range_old,work_range_new);
|
||||
success = ( (work_range_new == work_range_old) ||
|
||||
(work_range_new.first>=work_range_new.second) );
|
||||
work_range_old = work_range_new;
|
||||
work_range_new.second-=1;
|
||||
}
|
||||
if(work_range_old.first<work_range_old.second)
|
||||
return work_range_old.second-1;
|
||||
else
|
||||
return -1;
|
||||
}
|
||||
|
||||
// Reset the steal target
|
||||
inline void reset_steal_target() {
|
||||
m_current_steal_target = (m_pool_rank+1)%m_pool_topo[0];
|
||||
m_stealing = false;
|
||||
}
|
||||
|
||||
// Reset the steal target
|
||||
inline void reset_steal_target(int team_size) {
|
||||
m_current_steal_target = (m_pool_rank_rev+team_size);
|
||||
if(m_current_steal_target>=m_pool_topo[0])
|
||||
m_current_steal_target = 0;//m_pool_topo[0]-1;
|
||||
m_stealing = false;
|
||||
}
|
||||
|
||||
// Get a steal target; start with my-rank + 1 and go round robin, until arriving at this threads rank
|
||||
// Returns -1 fi no active steal target available
|
||||
inline int get_steal_target() {
|
||||
while(( m_pool[m_current_steal_target]->m_work_range.second <=
|
||||
m_pool[m_current_steal_target]->m_work_range.first ) &&
|
||||
(m_current_steal_target!=m_pool_rank) ) {
|
||||
m_current_steal_target = (m_current_steal_target+1)%m_pool_topo[0];
|
||||
}
|
||||
if(m_current_steal_target == m_pool_rank)
|
||||
return -1;
|
||||
else
|
||||
return m_current_steal_target;
|
||||
}
|
||||
|
||||
inline int get_steal_target(int team_size) {
|
||||
|
||||
while(( m_pool[m_current_steal_target]->m_work_range.second <=
|
||||
m_pool[m_current_steal_target]->m_work_range.first ) &&
|
||||
(m_current_steal_target!=m_pool_rank_rev) ) {
|
||||
if(m_current_steal_target + team_size < m_pool_topo[0])
|
||||
m_current_steal_target = (m_current_steal_target+team_size);
|
||||
else
|
||||
m_current_steal_target = 0;
|
||||
}
|
||||
|
||||
if(m_current_steal_target == m_pool_rank_rev)
|
||||
return -1;
|
||||
else
|
||||
return m_current_steal_target;
|
||||
}
|
||||
|
||||
inline long steal_work_index (int team_size = 0) {
|
||||
long index = -1;
|
||||
int steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
|
||||
while ( (steal_target != -1) && (index == -1)) {
|
||||
index = m_pool[steal_target]->get_work_index_end();
|
||||
if(index == -1)
|
||||
steal_target = team_size>0?get_steal_target(team_size):get_steal_target();
|
||||
}
|
||||
return index;
|
||||
}
|
||||
|
||||
// Get a work index. Claim from owned range until its exhausted, then steal from other thread
|
||||
inline long get_work_index (int team_size = 0) {
|
||||
long work_index = -1;
|
||||
if(!m_stealing) work_index = get_work_index_begin();
|
||||
|
||||
if( work_index == -1) {
|
||||
memory_fence();
|
||||
m_stealing = true;
|
||||
work_index = steal_work_index(team_size);
|
||||
}
|
||||
m_team_work_index = work_index;
|
||||
memory_fence();
|
||||
return work_index;
|
||||
}
|
||||
HostThreadTeamData * get_thread_data() noexcept
|
||||
{ return m_pool[ m_map_rank[ omp_get_thread_num() ] ]; }
|
||||
|
||||
inline static
|
||||
HostThreadTeamData * get_thread_data( int i ) noexcept
|
||||
{ return m_pool[i]; }
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
@ -294,356 +130,6 @@ public:
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
class OpenMPexecTeamMember {
|
||||
public:
|
||||
|
||||
enum { TEAM_REDUCE_SIZE = 512 };
|
||||
|
||||
/** \brief Thread states for team synchronization */
|
||||
enum { Active = 0 , Rendezvous = 1 };
|
||||
|
||||
typedef Kokkos::OpenMP execution_space ;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space ;
|
||||
|
||||
Impl::OpenMPexec & m_exec ;
|
||||
scratch_memory_space m_team_shared ;
|
||||
int m_team_scratch_size[2] ;
|
||||
int m_team_base_rev ;
|
||||
int m_team_rank_rev ;
|
||||
int m_team_rank ;
|
||||
int m_team_size ;
|
||||
int m_league_rank ;
|
||||
int m_league_end ;
|
||||
int m_league_size ;
|
||||
|
||||
int m_chunk_size;
|
||||
int m_league_chunk_end;
|
||||
Impl::OpenMPexec & m_team_lead_exec ;
|
||||
int m_invalid_thread;
|
||||
int m_team_alloc;
|
||||
|
||||
// Fan-in team threads, root of the fan-in which does not block returns true
|
||||
inline
|
||||
bool team_fan_in() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_wait( Active );
|
||||
}
|
||||
|
||||
if ( m_team_rank_rev ) {
|
||||
m_exec.state_set( Rendezvous );
|
||||
memory_fence();
|
||||
m_exec.state_wait( Rendezvous );
|
||||
}
|
||||
|
||||
return 0 == m_team_rank_rev ;
|
||||
}
|
||||
|
||||
inline
|
||||
void team_fan_out() const
|
||||
{
|
||||
memory_fence();
|
||||
for ( int n = 1 , j ; ( ( j = m_team_rank_rev + n ) < m_team_size ) && ! ( m_team_rank_rev & n ) ; n <<= 1 ) {
|
||||
m_exec.pool_rev( m_team_base_rev + j )->state_set( Active );
|
||||
memory_fence();
|
||||
}
|
||||
}
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space& team_shmem() const
|
||||
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space& team_scratch(int) const
|
||||
{ return m_team_shared.set_team_thread_mode(0,1,0) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const execution_space::scratch_memory_space& thread_scratch(int) const
|
||||
{ return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
|
||||
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{}
|
||||
#else
|
||||
{
|
||||
if ( 1 < m_team_size && !m_invalid_thread) {
|
||||
team_fan_in();
|
||||
team_fan_out();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
template<class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void team_broadcast(ValueType& value, const int& thread_id) const
|
||||
{
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ }
|
||||
#else
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(ValueType) < TEAM_REDUCE_SIZE
|
||||
, ValueType , void >::type type ;
|
||||
|
||||
type volatile * const shared_value =
|
||||
((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
|
||||
|
||||
if ( team_rank() == thread_id ) *shared_value = value;
|
||||
memory_fence();
|
||||
team_barrier(); // Wait for 'thread_id' to write
|
||||
value = *shared_value ;
|
||||
team_barrier(); // Wait for team members to read
|
||||
#endif
|
||||
}
|
||||
|
||||
template< class ValueType, class JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION ValueType
|
||||
team_reduce( const ValueType & value
|
||||
, const JoinOp & op_in ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return ValueType(); }
|
||||
#else
|
||||
{
|
||||
memory_fence();
|
||||
typedef ValueType value_type;
|
||||
const JoinLambdaAdapter<value_type,JoinOp> op(op_in);
|
||||
#endif
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(value_type) < TEAM_REDUCE_SIZE
|
||||
, value_type , void >::type type ;
|
||||
|
||||
type * const local_value = ((type*) m_exec.scratch_thread());
|
||||
|
||||
// Set this thread's contribution
|
||||
*local_value = value ;
|
||||
|
||||
// Fence to make sure the base team member has access:
|
||||
memory_fence();
|
||||
|
||||
if ( team_fan_in() ) {
|
||||
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
|
||||
type * const team_value = ((type*) m_exec.pool_rev( m_team_base_rev )->scratch_thread());
|
||||
|
||||
// Join to the team value:
|
||||
for ( int i = 1 ; i < m_team_size ; ++i ) {
|
||||
op.join( *team_value , *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) );
|
||||
}
|
||||
memory_fence();
|
||||
|
||||
// The base team member may "lap" the other team members,
|
||||
// copy to their local value before proceeding.
|
||||
for ( int i = 1 ; i < m_team_size ; ++i ) {
|
||||
*((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread()) = *team_value ;
|
||||
}
|
||||
|
||||
// Fence to make sure all team members have access
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
team_fan_out();
|
||||
|
||||
return *((type volatile const *)local_value);
|
||||
}
|
||||
#endif
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
* with intra-team non-deterministic ordering accumulation.
|
||||
*
|
||||
* The global inter-team accumulation value will, at the end of the
|
||||
* league's parallel execution, be the scan's total.
|
||||
* Parallel execution ordering of the league's teams is non-deterministic.
|
||||
* As such the base value for each team's scan operation is similarly
|
||||
* non-deterministic.
|
||||
*/
|
||||
template< typename ArgType >
|
||||
KOKKOS_INLINE_FUNCTION ArgType team_scan( const ArgType & value , ArgType * const global_accum ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return ArgType(); }
|
||||
#else
|
||||
{
|
||||
// Make sure there is enough scratch space:
|
||||
typedef typename if_c< sizeof(ArgType) < TEAM_REDUCE_SIZE , ArgType , void >::type type ;
|
||||
|
||||
volatile type * const work_value = ((type*) m_exec.scratch_thread());
|
||||
|
||||
*work_value = value ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
if ( team_fan_in() ) {
|
||||
// The last thread to synchronize returns true, all other threads wait for team_fan_out()
|
||||
// m_team_base[0] == highest ranking team member
|
||||
// m_team_base[ m_team_size - 1 ] == lowest ranking team member
|
||||
//
|
||||
// 1) copy from lower to higher rank, initialize lowest rank to zero
|
||||
// 2) prefix sum from lowest to highest rank, skipping lowest rank
|
||||
|
||||
type accum = 0 ;
|
||||
|
||||
if ( global_accum ) {
|
||||
for ( int i = m_team_size ; i-- ; ) {
|
||||
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
|
||||
accum += val ;
|
||||
}
|
||||
accum = atomic_fetch_add( global_accum , accum );
|
||||
}
|
||||
|
||||
for ( int i = m_team_size ; i-- ; ) {
|
||||
type & val = *((type*) m_exec.pool_rev( m_team_base_rev + i )->scratch_thread());
|
||||
const type offset = accum ;
|
||||
accum += val ;
|
||||
val = offset ;
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
team_fan_out();
|
||||
|
||||
return *work_value ;
|
||||
}
|
||||
#endif
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
*
|
||||
* The highest rank thread can compute the reduction total as
|
||||
* reduction_total = dev.team_scan( value ) + value ;
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
|
||||
{ return this-> template team_scan<Type>( value , 0 ); }
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver
|
||||
|
||||
private:
|
||||
|
||||
typedef execution_space::scratch_memory_space space ;
|
||||
|
||||
public:
|
||||
|
||||
template< class ... Properties >
|
||||
inline
|
||||
OpenMPexecTeamMember( Impl::OpenMPexec & exec
|
||||
, const TeamPolicyInternal< OpenMP, Properties ...> & team
|
||||
, const int shmem_size_L1
|
||||
, const int shmem_size_L2
|
||||
)
|
||||
: m_exec( exec )
|
||||
, m_team_shared(0,0)
|
||||
, m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
|
||||
, m_team_base_rev(0)
|
||||
, m_team_rank_rev(0)
|
||||
, m_team_rank(0)
|
||||
, m_team_size( team.team_size() )
|
||||
, m_league_rank(0)
|
||||
, m_league_end(0)
|
||||
, m_league_size( team.league_size() )
|
||||
, m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
|
||||
, m_league_chunk_end(0)
|
||||
, m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
|
||||
, m_team_alloc( team.team_alloc())
|
||||
{
|
||||
const int pool_rank_rev = m_exec.pool_rank_rev();
|
||||
const int pool_team_rank_rev = pool_rank_rev % team.team_alloc();
|
||||
const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
|
||||
const int pool_num_teams = OpenMP::thread_pool_size(0)/team.team_alloc();
|
||||
const int chunks_per_team = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
|
||||
int league_iter_end = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
|
||||
int league_iter_begin = league_iter_end - chunks_per_team * m_chunk_size;
|
||||
if (league_iter_begin < 0) league_iter_begin = 0;
|
||||
if (league_iter_end>team.league_size()) league_iter_end = team.league_size();
|
||||
|
||||
if ((team.team_alloc()>m_team_size)?
|
||||
(pool_team_rank_rev >= m_team_size):
|
||||
(m_exec.pool_size() - pool_num_teams*m_team_size > m_exec.pool_rank())
|
||||
)
|
||||
m_invalid_thread = 1;
|
||||
else
|
||||
m_invalid_thread = 0;
|
||||
|
||||
m_team_rank_rev = pool_team_rank_rev ;
|
||||
if ( pool_team_rank_rev < m_team_size && !m_invalid_thread ) {
|
||||
m_team_base_rev = team.team_alloc() * pool_league_rank_rev ;
|
||||
m_team_rank_rev = pool_team_rank_rev ;
|
||||
m_team_rank = m_team_size - ( m_team_rank_rev + 1 );
|
||||
m_league_end = league_iter_end ;
|
||||
m_league_rank = league_iter_begin ;
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
|
||||
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
|
||||
0 );
|
||||
}
|
||||
|
||||
if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
|
||||
m_exec.set_work_range(m_league_rank,m_league_end,m_chunk_size);
|
||||
m_exec.reset_steal_target(m_team_size);
|
||||
}
|
||||
}
|
||||
|
||||
bool valid_static() const
|
||||
{
|
||||
return m_league_rank < m_league_end ;
|
||||
}
|
||||
|
||||
void next_static()
|
||||
{
|
||||
if ( m_league_rank < m_league_end ) {
|
||||
team_barrier();
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
|
||||
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
|
||||
0);
|
||||
}
|
||||
m_league_rank++;
|
||||
}
|
||||
|
||||
bool valid_dynamic() {
|
||||
if(m_invalid_thread)
|
||||
return false;
|
||||
if ((m_league_rank < m_league_chunk_end) && (m_league_rank < m_league_size)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if ( m_team_rank_rev == 0 ) {
|
||||
m_team_lead_exec.get_work_index(m_team_alloc);
|
||||
}
|
||||
team_barrier();
|
||||
|
||||
long work_index = m_team_lead_exec.team_work_index();
|
||||
|
||||
m_league_rank = work_index * m_chunk_size;
|
||||
m_league_chunk_end = (work_index +1 ) * m_chunk_size;
|
||||
|
||||
if(m_league_chunk_end > m_league_size) m_league_chunk_end = m_league_size;
|
||||
|
||||
if(m_league_rank>=0)
|
||||
return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
void next_dynamic() {
|
||||
if(m_invalid_thread)
|
||||
return;
|
||||
|
||||
if ( m_league_rank < m_league_chunk_end ) {
|
||||
team_barrier();
|
||||
new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
|
||||
( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
|
||||
0);
|
||||
}
|
||||
m_league_rank++;
|
||||
}
|
||||
|
||||
static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
|
||||
};
|
||||
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::OpenMP, Properties ... >: public PolicyTraits<Properties ...>
|
||||
{
|
||||
@ -671,8 +157,11 @@ public:
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & )
|
||||
{ return traits::execution_space::thread_pool_size(1); }
|
||||
int team_size_max( const FunctorType & ) {
|
||||
int pool_size = traits::execution_space::thread_pool_size(1);
|
||||
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
return pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
}
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
@ -702,7 +191,8 @@ private:
|
||||
, const int team_size_request )
|
||||
{
|
||||
const int pool_size = traits::execution_space::thread_pool_size(0);
|
||||
const int team_max = traits::execution_space::thread_pool_size(1);
|
||||
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
const int team_grain = traits::execution_space::thread_pool_size(2);
|
||||
|
||||
m_league_size = league_size_request ;
|
||||
@ -823,7 +313,7 @@ private:
|
||||
}
|
||||
|
||||
public:
|
||||
typedef Impl::OpenMPexecTeamMember member_type ;
|
||||
typedef Impl::HostThreadTeamMember< Kokkos::OpenMP > member_type ;
|
||||
};
|
||||
} // namespace Impl
|
||||
|
||||
@ -850,216 +340,6 @@ int OpenMP::thread_pool_rank()
|
||||
#endif
|
||||
}
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >
|
||||
TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType& count ) {
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, count );
|
||||
}
|
||||
|
||||
template< typename iType1, typename iType2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::OpenMPexecTeamMember >
|
||||
TeamThreadRange( const Impl::OpenMPexecTeamMember& thread, const iType1& begin, const iType2& end ) {
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenMPexecTeamMember >( thread, iType(begin), iType(end) );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >
|
||||
ThreadVectorRange(const Impl::OpenMPexecTeamMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >(thread,count);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember> PerTeam(const Impl::OpenMPexecTeamMember& thread) {
|
||||
return Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>(thread);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember> PerThread(const Impl::OpenMPexecTeamMember& thread) {
|
||||
return Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>(thread);
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, ValueType& result) {
|
||||
|
||||
result = ValueType();
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
|
||||
result = loop_boundaries.thread.team_reduce(result,Impl::JoinAdd<ValueType>());
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember>& loop_boundaries,
|
||||
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
|
||||
init_result = loop_boundaries.thread.team_reduce(result,join);
|
||||
}
|
||||
|
||||
} //namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
/** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
|
||||
loop_boundaries, const Lambda& lambda) {
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a summation of
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
result = ValueType();
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
}
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of
|
||||
* val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result.
|
||||
* The input value of init_result is used as initializer for temporary variables of ValueType. Therefore
|
||||
* the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
init_result = result;
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
|
||||
* for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all vector lanes in the thread and a scan operation is performed.
|
||||
* Depending on the target execution space the operator might be called twice: once with final=false
|
||||
* and once with final=true. When final==true val contains the prefix sum value. The contribution of this
|
||||
* "i" needs to be added to val no matter whether final==true or not. In a serial execution
|
||||
* (i.e. team_size==1) the operator is only called once with final==true. Scan_val will be set
|
||||
* to the final sum value over all vector lanes.
|
||||
* This functionality requires C++11 support.*/
|
||||
template< typename iType, class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::OpenMPexecTeamMember >&
|
||||
loop_boundaries, const FunctorType & lambda) {
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
|
||||
typedef typename ValueTraits::value_type value_type ;
|
||||
|
||||
value_type scan_val = value_type();
|
||||
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
lambda(i,scan_val,true);
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
|
||||
lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda) {
|
||||
if(single_struct.team_member.team_rank()==0) lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
lambda(val);
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::OpenMPexecTeamMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
if(single_struct.team_member.team_rank()==0) {
|
||||
lambda(val);
|
||||
}
|
||||
single_struct.team_member.team_broadcast(val,0);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* #ifndef KOKKOS_OPENMPEXEC_HPP */
|
||||
|
||||
@ -1,511 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREAD )
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
#include <Kokkos_Qthread.hpp>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
// Defines to enable experimental Qthread functionality
|
||||
|
||||
#define QTHREAD_LOCAL_PRIORITY
|
||||
#define CLONED_TASKS
|
||||
|
||||
#include <qthread/qthread.h>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
|
||||
enum { MAXIMUM_QTHREAD_WORKERS = 1024 };
|
||||
|
||||
/** s_exec is indexed by the reverse rank of the workers
|
||||
* for faster fan-in / fan-out lookups
|
||||
* [ n - 1 , n - 2 , ... , 0 ]
|
||||
*/
|
||||
QthreadExec * s_exec[ MAXIMUM_QTHREAD_WORKERS ];
|
||||
|
||||
int s_number_shepherds = 0 ;
|
||||
int s_number_workers_per_shepherd = 0 ;
|
||||
int s_number_workers = 0 ;
|
||||
|
||||
inline
|
||||
QthreadExec ** worker_exec()
|
||||
{
|
||||
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local(NULL) + 1 );
|
||||
}
|
||||
|
||||
const int s_base_size = QthreadExec::align_alloc( sizeof(QthreadExec) );
|
||||
|
||||
int s_worker_reduce_end = 0 ; /* End of worker reduction memory */
|
||||
int s_worker_shared_end = 0 ; /* Total of worker scratch memory */
|
||||
int s_worker_shared_begin = 0 ; /* Beginning of worker shared memory */
|
||||
|
||||
QthreadExecFunctionPointer volatile s_active_function = 0 ;
|
||||
const void * volatile s_active_function_arg = 0 ;
|
||||
|
||||
} /* namespace */
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
int Qthread::is_initialized()
|
||||
{
|
||||
return Impl::s_number_workers != 0 ;
|
||||
}
|
||||
|
||||
int Qthread::concurrency()
|
||||
{
|
||||
return Impl::s_number_workers_per_shepherd ;
|
||||
}
|
||||
|
||||
int Qthread::in_parallel()
|
||||
{
|
||||
return Impl::s_active_function != 0 ;
|
||||
}
|
||||
|
||||
void Qthread::initialize( int thread_count )
|
||||
{
|
||||
// Environment variable: QTHREAD_NUM_SHEPHERDS
|
||||
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
|
||||
// Environment variable: QTHREAD_HWPAR
|
||||
|
||||
{
|
||||
char buffer[256];
|
||||
snprintf(buffer,sizeof(buffer),"QTHREAD_HWPAR=%d",thread_count);
|
||||
putenv(buffer);
|
||||
}
|
||||
|
||||
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
|
||||
( thread_count == qthread_num_shepherds() * qthread_num_workers_local(NO_SHEPHERD) ) &&
|
||||
( thread_count == qthread_num_workers() );
|
||||
|
||||
bool ok_symmetry = true ;
|
||||
|
||||
if ( ok_init ) {
|
||||
Impl::s_number_shepherds = qthread_num_shepherds();
|
||||
Impl::s_number_workers_per_shepherd = qthread_num_workers_local(NO_SHEPHERD);
|
||||
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd ;
|
||||
|
||||
for ( int i = 0 ; ok_symmetry && i < Impl::s_number_shepherds ; ++i ) {
|
||||
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local(i) );
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! ok_init || ! ok_symmetry ) {
|
||||
std::ostringstream msg ;
|
||||
|
||||
msg << "Kokkos::Qthread::initialize(" << thread_count << ") FAILED" ;
|
||||
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
|
||||
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local(NO_SHEPHERD);
|
||||
msg << " : qthread_num_workers = " << qthread_num_workers();
|
||||
|
||||
if ( ! ok_symmetry ) {
|
||||
msg << " : qthread_num_workers_local = {" ;
|
||||
for ( int i = 0 ; i < Impl::s_number_shepherds ; ++i ) {
|
||||
msg << " " << qthread_num_workers_local(i) ;
|
||||
}
|
||||
msg << " }" ;
|
||||
}
|
||||
|
||||
Impl::s_number_workers = 0 ;
|
||||
Impl::s_number_shepherds = 0 ;
|
||||
Impl::s_number_workers_per_shepherd = 0 ;
|
||||
|
||||
if ( ok_init ) { qthread_finalize(); }
|
||||
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
|
||||
Impl::QthreadExec::resize_worker_scratch( 256 , 256 );
|
||||
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
}
|
||||
|
||||
// Shut down the Qthread execution backend.
// Teardown order matters: worker scratch blocks are released first, then the
// qthread runtime is finalized (only if initialize() previously succeeded,
// signalled by a non-zero worker count), then the cached topology counters
// are reset so a subsequent initialize() starts from a clean state.
void Qthread::finalize()
{
  // Free the per-worker QthreadExec scratch allocations.
  Impl::QthreadExec::clear_workers();

  // s_number_workers is non-zero only after a successful initialize().
  if ( Impl::s_number_workers ) {
    qthread_finalize();
  }

  Impl::s_number_workers    = 0 ;
  Impl::s_number_shepherds  = 0 ;
  Impl::s_number_workers_per_shepherd = 0 ;
}
|
||||
|
||||
void Qthread::print_configuration( std::ostream & s , const bool detail )
|
||||
{
|
||||
s << "Kokkos::Qthread {"
|
||||
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
|
||||
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
|
||||
<< " }" << std::endl ;
|
||||
}
|
||||
|
||||
// Return the process-wide Qthread execution-space singleton.
// The int argument is accepted for interface compatibility and ignored.
Qthread & Qthread::instance( int )
{
  // Function-local static: constructed once, on first use (Meyers singleton).
  static Qthread singleton ;
  return singleton ;
}
|
||||
|
||||
// Fence is a no-op for this backend: exec_all() appears to complete all
// forked work before returning (the calling thread itself runs the final
// driver invocation), so there is nothing outstanding to wait on.
// NOTE(review): presumably all parallel dispatches are synchronous — confirm.
void Qthread::fence()
{
}
|
||||
|
||||
// Accessors for the cached qthread topology captured during initialize().
int Qthread::shepherd_size() const { return Impl::s_number_shepherds ; }
int Qthread::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd ; }
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace {
|
||||
|
||||
// Qthread task entry point used by QthreadExec::exec_all().
// Runs on every forked worker: looks up this worker's QthreadExec instance
// and invokes the currently installed parallel function with its argument.
// The function pointer and argument are file-scope globals set by exec_all()
// before forking and cleared after all workers return.
aligned_t driver_exec_all( void * arg )
{
  // Dereference twice: worker_exec() yields a pointer into the s_exec table.
  QthreadExec & exec = **worker_exec();

  (*s_active_function)( exec , s_active_function_arg );

/*
fprintf( stdout
       , "QthreadExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
       , exec.worker_rank()
       , exec.worker_size()
       , exec.shepherd_rank()
       , exec.shepherd_size()
       , exec.shepherd_worker_rank()
       , exec.shepherd_worker_size()
       );
fflush(stdout);
*/

  return 0 ;
}
|
||||
|
||||
// Qthread task entry point used by QthreadExec::resize_worker_scratch().
// Each worker allocates and placement-constructs its own QthreadExec block
// (for first-touch locality), serialized by a hand-rolled spinlock, then all
// workers rendezvous on a counting barrier before returning.
aligned_t driver_resize_worker_scratch( void * arg )
{
  // File-local spinlock (lock_begin) and arrival counter (lock_end),
  // shared by all workers executing this driver concurrently.
  static volatile int lock_begin = 0 ;
  static volatile int lock_end   = 0 ;

  // Slot in the global s_exec table belonging to this worker.
  QthreadExec ** const exec = worker_exec();

  //----------------------------------------
  // Serialize allocation for thread safety

  while ( ! atomic_compare_exchange_strong( & lock_begin , 0 , 1 ) ); // Spin wait to claim lock

  // Only allocate if this worker's slot is still empty; a non-empty slot
  // here indicates a duplicate invocation for the same worker (an error).
  const bool ok = 0 == *exec ;

  // One raw allocation holds the QthreadExec object plus its scratch area.
  if ( ok ) { *exec = (QthreadExec *) malloc( s_base_size + s_worker_shared_end ); }

  lock_begin = 0 ; // release lock

  // Placement-construct outside the lock; the constructor only reads
  // per-worker qthread state and file-scope globals.
  if ( ok ) { new( *exec ) QthreadExec(); }

  //----------------------------------------
  // Wait for all calls to complete to insure that each worker has executed.

  // The last worker to arrive resets the counter, releasing everyone.
  if ( s_number_workers == 1 + atomic_fetch_add( & lock_end , 1 ) ) { lock_end = 0 ; }

  while ( lock_end );

/*
fprintf( stdout
       , "QthreadExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
       , (**exec).worker_rank()
       , (**exec).worker_size()
       , (**exec).shepherd_rank()
       , (**exec).shepherd_size()
       , (**exec).shepherd_worker_rank()
       , (**exec).shepherd_worker_size()
       );
fflush(stdout);
*/

  //----------------------------------------

  // Report (but do not throw from a worker task) if the slot was not empty.
  if ( ! ok ) {
    fprintf( stderr , "Kokkos::QthreadExec resize failed\n" );
    fflush( stderr );
  }

  return 0 ;
}
|
||||
|
||||
// Throw if the caller is not the main process thread (shepherd 0, worker 0),
// or — when 'not_active' is set — if a parallel dispatch is currently in
// flight (the active-function globals are non-null). 'label' names the
// calling API in the error message.
void verify_is_process( const char * const label , bool not_active = false )
{
  // Main process runs as shepherd 0 / local worker 0; anything else is a task.
  const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local(NULL);
  // Only checked when the caller requires no parallel work in progress.
  const bool is_active   = not_active && ( s_active_function || s_active_function_arg );

  if ( not_process || is_active ) {
    std::string msg( label );
    msg.append( " : FAILED" );
    if ( not_process ) msg.append(" : not called by main process");
    if ( is_active )   msg.append(" : parallel execution in progress");
    Kokkos::Impl::throw_runtime_exception( msg );
  }
}
|
||||
|
||||
}
|
||||
|
||||
// Number of worker threads per shepherd, as cached at initialization.
int QthreadExec::worker_per_shepherd()
{
  return s_number_workers_per_shepherd ;
}
|
||||
|
||||
// Construct this worker's execution record. Runs inside a qthread worker
// (via placement-new in driver_resize_worker_scratch), so the qthread
// queries below identify the constructing worker itself.
QthreadExec::QthreadExec()
{
  const int shepherd_rank        = qthread_shep();
  const int shepherd_worker_rank = qthread_worker_local(NULL);
  // Global worker rank is row-major over (shepherd, worker-in-shepherd).
  const int worker_rank          = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank ;

  // s_exec is ordered by *reverse* worker rank; m_shepherd_base points at
  // the start of this shepherd's contiguous span within that table.
  m_worker_base          = s_exec ;
  m_shepherd_base        = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
  // Scratch memory lives immediately after the object in the same malloc block.
  m_scratch_alloc        = ( (unsigned char *) this ) + s_base_size ;
  m_reduce_end           = s_worker_reduce_end ;
  m_shepherd_rank        = shepherd_rank ;
  m_shepherd_size        = s_number_shepherds ;
  m_shepherd_worker_rank = shepherd_worker_rank ;
  m_shepherd_worker_size = s_number_workers_per_shepherd ;
  m_worker_rank          = worker_rank ;
  m_worker_size          = s_number_workers ;
  m_worker_state         = QthreadExec::Active ;
}
|
||||
|
||||
// Release every worker's QthreadExec allocation and null its table slot.
// The blocks were obtained with malloc + placement-new, so plain free()
// is the matching deallocation.
void QthreadExec::clear_workers()
{
  for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
    QthreadExec * const exec = s_exec[iwork] ;
    // Null the slot before freeing so the table never dangles.
    s_exec[iwork] = 0 ;
    free( exec );
  }
}
|
||||
|
||||
// Re-initialize (via placement-new) the given scratch_memory_space so it
// views this worker's shepherd-shared scratch region: the slice
// [s_worker_shared_begin, s_worker_shared_end) of the shepherd-lead
// worker's scratch allocation.
void QthreadExec::shared_reset( Qthread::scratch_memory_space & space )
{
  new( & space )
    Qthread::scratch_memory_space(
      ((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin ,
      s_worker_shared_end - s_worker_shared_begin
    );
}
|
||||
|
||||
// Grow (never shrink) every worker's scratch allocation so it can hold
// 'reduce_size' bytes of reduction scratch plus 'shared_size' bytes of
// shepherd-shared scratch (each rounded up to the allocation grain).
// If growth is needed, all existing worker blocks are freed and every
// worker re-allocates its own block (first-touch) by running
// driver_resize_worker_scratch on its shepherd.
void QthreadExec::resize_worker_scratch( const int reduce_size , const int shared_size )
{
  const int exec_all_reduce_alloc = align_alloc( reduce_size );
  const int shepherd_scan_alloc   = align_alloc( 8 );
  const int shepherd_shared_end   = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );

  // Only reallocate when the current layout is too small.
  if ( s_worker_reduce_end < exec_all_reduce_alloc ||
       s_worker_shared_end < shepherd_shared_end ) {

/*
  fprintf( stdout , "QthreadExec::resize\n");
  fflush(stdout);
*/

    // Clear current worker memory before allocating new worker memory
    clear_workers();

    // Increase the buffers to an aligned allocation
    s_worker_reduce_end   = exec_all_reduce_alloc ;
    s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc ;
    s_worker_shared_end   = shepherd_shared_end ;

    // Need to query which shepherd this main 'process' is running...

    const int main_shep = qthread_shep();

    // Have each worker resize its memory for proper first-touch
#if 0
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i ) {
      qthread_fork_to( driver_resize_worker_scratch , NULL , NULL , jshep );
    }}
#else
    // If this function is used before the 'qthread.task_policy' unit test
    // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
    for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
      // On the main shepherd the calling thread itself is one of the
      // workers, so fork one fewer clone there.
      const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;

      if ( num_clone ) {
        const int ret = qthread_fork_clones_to_local_priority
          ( driver_resize_worker_scratch   /* function */
          , NULL                           /* function data block */
          , NULL                           /* pointer to return value feb */
          , jshep                          /* shepherd number */
          , num_clone - 1                  /* number of instances - 1 */
          );

        assert(ret == QTHREAD_SUCCESS);
      }
    }
#endif

    // The main thread participates as the final worker; this call also
    // blocks in the driver's barrier until every worker has allocated.
    driver_resize_worker_scratch( NULL );

    // Verify all workers allocated

    bool ok = true ;
    for ( int iwork = 0 ; ok && iwork < s_number_workers ; ++iwork ) { ok = 0 != s_exec[iwork] ; }

    if ( ! ok ) {
      std::ostringstream msg ;
      msg << "Kokkos::Impl::QthreadExec::resize : FAILED for workers {" ;
      for ( int iwork = 0 ; iwork < s_number_workers ; ++iwork ) {
         // s_exec is ordered by reverse rank; report the worker's true rank.
         if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
      }
      msg << " }" ;
      Kokkos::Impl::throw_runtime_exception( msg.str() );
    }
  }
}
|
||||
|
||||
// Run 'func(exec, arg)' on every qthread worker. Must be called from the
// main process thread with no other dispatch in flight (verified below).
// The function/argument pair is published through file-scope globals,
// clones of driver_exec_all are forked to each shepherd, and the calling
// thread itself executes the final driver invocation before clearing the
// globals.
void QthreadExec::exec_all( Qthread & , QthreadExecFunctionPointer func , const void * arg )
{
  verify_is_process("QthreadExec::exec_all(...)",true);

/*
  fprintf( stdout , "QthreadExec::exec_all\n");
  fflush(stdout);
*/

  // Publish the work for driver_exec_all to pick up.
  s_active_function     = func ;
  s_active_function_arg = arg ;

  // Need to query which shepherd this main 'process' is running...

  const int main_shep = qthread_shep();

#if 0
  for ( int jshep = 0 , iwork = 0 ; jshep < s_number_shepherds ; ++jshep ) {
  for ( int i = jshep != main_shep ? 0 : 1 ; i < s_number_workers_per_shepherd ; ++i , ++iwork ) {
    qthread_fork_to( driver_exec_all , NULL , NULL , jshep );
  }}
#else
  // If this function is used before the 'qthread.task_policy' unit test
  // the 'qthread.task_policy' unit test fails with a seg-fault within libqthread.so.
  for ( int jshep = 0 ; jshep < s_number_shepherds ; ++jshep ) {
    // The calling thread covers one worker slot on its own shepherd.
    const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1 ;

    if ( num_clone ) {
      const int ret = qthread_fork_clones_to_local_priority
        ( driver_exec_all   /* function */
        , NULL              /* function data block */
        , NULL              /* pointer to return value feb */
        , jshep             /* shepherd number */
        , num_clone - 1     /* number of instances - 1 */
        );

      assert(ret == QTHREAD_SUCCESS);
    }
  }
#endif

  // Main thread participates as the last worker.
  driver_exec_all( NULL );

  // Clear the published work so verify_is_process sees an idle backend.
  s_active_function     = 0 ;
  s_active_function_arg = 0 ;
}
|
||||
|
||||
// Scratch of the highest-ranked worker (s_exec is reverse-rank ordered,
// so slot 0 holds the reduction fan-in root) — where the final reduction
// value lands after exec_all_reduce.
void * QthreadExec::exec_all_reduce_result()
{
  return s_exec[0]->m_scratch_alloc ;
}
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
// Default member: a degenerate single-thread "team" (team and league of
// size one, ranks zero) bound to the calling worker's QthreadExec.
QthreadTeamPolicyMember::QthreadTeamPolicyMember()
  : m_exec( **worker_exec() )
  , m_team_shared(0,0)
  , m_team_size( 1 )
  , m_team_rank( 0 )
  , m_league_size(1)
  , m_league_end(1)
  , m_league_rank(0)
{
  // Point m_team_shared at the shepherd's shared scratch region.
  m_exec.shared_reset( m_team_shared );
}
|
||||
|
||||
// Task-team member: the team is the calling worker's whole shepherd
// (team size = workers per shepherd, team rank = rank within shepherd),
// still with a league of one.
QthreadTeamPolicyMember::QthreadTeamPolicyMember( const QthreadTeamPolicyMember::TaskTeam & )
  : m_exec( **worker_exec() )
  , m_team_shared(0,0)
  , m_team_size( s_number_workers_per_shepherd )
  , m_team_rank( m_exec.shepherd_worker_rank() )
  , m_league_size(1)
  , m_league_end(1)
  , m_league_rank(0)
{
  // Point m_team_shared at the shepherd's shared scratch region.
  m_exec.shared_reset( m_team_shared );
}
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
|
||||
|
||||
@ -1,620 +0,0 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_QTHREADEXEC_HPP
|
||||
#define KOKKOS_QTHREADEXEC_HPP
|
||||
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
class QthreadExec ;
|
||||
|
||||
typedef void (*QthreadExecFunctionPointer)( QthreadExec & , const void * );
|
||||
|
||||
class QthreadExec {
|
||||
private:
|
||||
|
||||
enum { Inactive = 0 , Active = 1 };
|
||||
|
||||
const QthreadExec * const * m_worker_base ;
|
||||
const QthreadExec * const * m_shepherd_base ;
|
||||
|
||||
void * m_scratch_alloc ; ///< Scratch memory [ reduce , team , shared ]
|
||||
int m_reduce_end ; ///< End of scratch reduction memory
|
||||
|
||||
int m_shepherd_rank ;
|
||||
int m_shepherd_size ;
|
||||
|
||||
int m_shepherd_worker_rank ;
|
||||
int m_shepherd_worker_size ;
|
||||
|
||||
/*
|
||||
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
|
||||
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
|
||||
*/
|
||||
int m_worker_rank ;
|
||||
int m_worker_size ;
|
||||
|
||||
int mutable volatile m_worker_state ;
|
||||
|
||||
|
||||
friend class Kokkos::Qthread ;
|
||||
|
||||
~QthreadExec();
|
||||
QthreadExec( const QthreadExec & );
|
||||
QthreadExec & operator = ( const QthreadExec & );
|
||||
|
||||
public:
|
||||
|
||||
QthreadExec();
|
||||
|
||||
/** Execute the input function on all available Qthread workers */
|
||||
static void exec_all( Qthread & , QthreadExecFunctionPointer , const void * );
|
||||
|
||||
//----------------------------------------
|
||||
/** Barrier across all workers participating in the 'exec_all' */
|
||||
void exec_all_barrier() const
|
||||
{
|
||||
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
|
||||
|
||||
int n , j ;
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
}
|
||||
|
||||
/** Barrier across workers within the shepherd with rank < team_rank */
|
||||
void shepherd_barrier( const int team_size ) const
|
||||
{
|
||||
if ( m_shepherd_worker_rank < team_size ) {
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n , j ;
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
/** Reduce across all workers participating in the 'exec_all' */
|
||||
template< class FunctorType , class ReducerType , class ArgTag >
|
||||
inline
|
||||
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
|
||||
{
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
|
||||
|
||||
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
|
||||
|
||||
int n , j ;
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
const QthreadExec & fan = *m_worker_base[j];
|
||||
|
||||
Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
|
||||
|
||||
ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
/** Scall across all workers participating in the 'exec_all' */
|
||||
template< class FunctorType , class ArgTag >
|
||||
inline
|
||||
void exec_all_scan( const FunctorType & func ) const
|
||||
{
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType , ArgTag > ValueInit ;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType , ArgTag > ValueOps ;
|
||||
|
||||
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
|
||||
|
||||
int n , j ;
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
Impl::spinwait( m_worker_base[j]->m_worker_state , QthreadExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
else {
|
||||
// Root thread scans across values before releasing threads
|
||||
// Worker data is in reverse order, so m_worker_base[0] is the
|
||||
// highest ranking thread.
|
||||
|
||||
// Copy from lower ranking to higher ranking worker.
|
||||
for ( int i = 1 ; i < m_worker_size ; ++i ) {
|
||||
ValueOps::copy( func
|
||||
, m_worker_base[i-1]->m_scratch_alloc
|
||||
, m_worker_base[i]->m_scratch_alloc
|
||||
);
|
||||
}
|
||||
|
||||
ValueInit::init( func , m_worker_base[m_worker_size-1]->m_scratch_alloc );
|
||||
|
||||
// Join from lower ranking to higher ranking worker.
|
||||
// Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
|
||||
for ( int i = m_worker_size - 1 ; --i > 0 ; ) {
|
||||
ValueJoin::join( func , m_worker_base[i-1]->m_scratch_alloc , m_worker_base[i]->m_scratch_alloc );
|
||||
}
|
||||
}
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ) ; n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class Type>
|
||||
inline
|
||||
volatile Type * shepherd_team_scratch_value() const
|
||||
{ return (volatile Type*)(((unsigned char *) m_scratch_alloc) + m_reduce_end); }
|
||||
|
||||
template< class Type >
|
||||
inline
|
||||
void shepherd_broadcast( Type & value , const int team_size , const int team_rank ) const
|
||||
{
|
||||
if ( m_shepherd_base ) {
|
||||
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value ; }
|
||||
memory_fence();
|
||||
shepherd_barrier( team_size );
|
||||
value = *shared_value ;
|
||||
}
|
||||
}
|
||||
|
||||
template< class Type >
|
||||
inline
|
||||
Type shepherd_reduce( const int team_size , const Type & value ) const
|
||||
{
|
||||
*shepherd_team_scratch_value<Type>() = value ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n , j ;
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
else {
|
||||
Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
for ( int i = 1 ; i < n ; ++i ) {
|
||||
accum += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
for ( int i = 1 ; i < n ; ++i ) {
|
||||
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
||||
return *shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
template< class JoinOp >
|
||||
inline
|
||||
typename JoinOp::value_type
|
||||
shepherd_reduce( const int team_size
|
||||
, const typename JoinOp::value_type & value
|
||||
, const JoinOp & op ) const
|
||||
{
|
||||
typedef typename JoinOp::value_type Type ;
|
||||
|
||||
*shepherd_team_scratch_value<Type>() = value ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n , j ;
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
else {
|
||||
volatile Type & accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
for ( int i = 1 ; i < team_size ; ++i ) {
|
||||
op.join( accum , * m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
|
||||
}
|
||||
for ( int i = 1 ; i < team_size ; ++i ) {
|
||||
* m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum ;
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
||||
return *shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
template< class Type >
|
||||
inline
|
||||
Type shepherd_scan( const int team_size
|
||||
, const Type & value
|
||||
, Type * const global_value = 0 ) const
|
||||
{
|
||||
*shepherd_team_scratch_value<Type>() = value ;
|
||||
|
||||
memory_fence();
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n , j ;
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
Impl::spinwait( m_shepherd_base[j]->m_worker_state , QthreadExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadExec::Inactive ;
|
||||
Impl::spinwait( m_worker_state , QthreadExec::Inactive );
|
||||
}
|
||||
else {
|
||||
// Root thread scans across values before releasing threads
|
||||
// Worker data is in reverse order, so m_shepherd_base[0] is the
|
||||
// highest ranking thread.
|
||||
|
||||
// Copy from lower ranking to higher ranking worker.
|
||||
|
||||
Type accum = * m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
for ( int i = 1 ; i < team_size ; ++i ) {
|
||||
const Type tmp = * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
|
||||
accum += tmp ;
|
||||
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp ;
|
||||
}
|
||||
|
||||
* m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
|
||||
global_value ? atomic_fetch_add( global_value , accum ) : 0 ;
|
||||
|
||||
// Join from lower ranking to higher ranking worker.
|
||||
for ( int i = team_size ; --i ; ) {
|
||||
* m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += * m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
for ( n = 1 ; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ) ; n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadExec::Active ;
|
||||
}
|
||||
|
||||
return *shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
static inline
|
||||
int align_alloc( int size )
|
||||
{
|
||||
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */};
|
||||
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
|
||||
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK ;
|
||||
}
|
||||
|
||||
void shared_reset( Qthread::scratch_memory_space & );
|
||||
|
||||
void * exec_all_reduce_value() const { return m_scratch_alloc ; }
|
||||
|
||||
static void * exec_all_reduce_result();
|
||||
|
||||
static void resize_worker_scratch( const int reduce_size , const int shared_size );
|
||||
static void clear_workers();
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline int worker_rank() const { return m_worker_rank ; }
|
||||
inline int worker_size() const { return m_worker_size ; }
|
||||
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank ; }
|
||||
inline int shepherd_worker_size() const { return m_shepherd_worker_size ; }
|
||||
inline int shepherd_rank() const { return m_shepherd_rank ; }
|
||||
inline int shepherd_size() const { return m_shepherd_size ; }
|
||||
|
||||
static int worker_per_shepherd();
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/// Team-policy member handle for the Qthread backend.
///
/// Wraps a worker's QthreadExec and exposes the Kokkos team interface
/// (ranks, sizes, barrier, broadcast, reduce, scan). Collectives are
/// delegated to the shepherd_* operations of QthreadExec; when
/// KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST is not defined they compile
/// to stubs (this code only executes on host).
class QthreadTeamPolicyMember {
private:

  typedef Kokkos::Qthread                        execution_space ;
  typedef execution_space::scratch_memory_space  scratch_memory_space ;


  Impl::QthreadExec   & m_exec ;         // this worker's execution record
  scratch_memory_space  m_team_shared ;  // view of shepherd-shared scratch
  const int             m_team_size ;
  const int             m_team_rank ;
  const int             m_league_size ;
  const int             m_league_end ;   // one past this shepherd's last league index
  int                   m_league_rank ;  // advanced by next_team()

public:

  KOKKOS_INLINE_FUNCTION
  const scratch_memory_space & team_shmem() const { return m_team_shared ; }

  KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
  KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
  KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank ; }
  KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size ; }

  // Barrier across the members of this team.
  KOKKOS_INLINE_FUNCTION void team_barrier() const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    {}
#else
    { m_exec.shepherd_barrier( m_team_size ); }
#endif

  // Broadcast 'value' from team member 'rank' to the whole team.
  template< typename Type >
  KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value , int rank ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    { return Type(); }
#else
    { return m_exec.template shepherd_broadcast<Type>( value , m_team_size , rank ); }
#endif

  // Sum-reduce 'value' across the team; every member gets the total.
  template< typename Type >
  KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    { return Type(); }
#else
    { return m_exec.template shepherd_reduce<Type>( m_team_size , value ); }
#endif

  // Reduce 'value' across the team with join operator 'op'.
  template< typename JoinOp >
  KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
    team_reduce( const typename JoinOp::value_type & value
               , const JoinOp & op ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    { return typename JoinOp::value_type(); }
#else
    { return m_exec.template shepherd_reduce<JoinOp>( m_team_size , value , op ); }
#endif

  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering.
   *
   *  The highest rank thread can compute the reduction total as
   *    reduction_total = dev.team_scan( value ) + value ;
   */
  template< typename Type >
  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    { return Type(); }
#else
    { return m_exec.template shepherd_scan<Type>( m_team_size , value ); }
#endif

  /** \brief  Intra-team exclusive prefix sum with team_rank() ordering
   *          with intra-team non-deterministic ordering accumulation.
   *
   *  The global inter-team accumulation value will, at the end of the
   *  league's parallel execution, be the scan's total.
   *  Parallel execution ordering of the league's teams is non-deterministic.
   *  As such the base value for each team's scan operation is similarly
   *  non-deterministic.
   */
  template< typename Type >
  KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value , Type * const global_accum ) const
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
    { return Type(); }
#else
    { return m_exec.template shepherd_scan<Type>( m_team_size , value , global_accum ); }
#endif

  //----------------------------------------
  // Private driver for task-team parallel

  struct TaskTeam {};

  QthreadTeamPolicyMember();
  explicit QthreadTeamPolicyMember( const TaskTeam & );

  //----------------------------------------
  // Private for the driver ( for ( member_type i(exec,team); i ; i.next_team() ) { ... }

  // Initialize a member for the given policy: league indices are
  // block-partitioned across shepherds via team.m_shepherd_iter.
  template< class ... Properties >
  QthreadTeamPolicyMember( Impl::QthreadExec & exec
                         , const Kokkos::Impl::TeamPolicyInternal<Qthread,Properties...> & team )
    : m_exec( exec )
    , m_team_shared(0,0)
    , m_team_size( team.m_team_size )
    , m_team_rank( exec.shepherd_worker_rank() )
    , m_league_size( team.m_league_size )
    , m_league_end(  team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
    , m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
    {
      m_exec.shared_reset( m_team_shared );
    }

  // Continue: true while this shepherd still has league iterations left.
  operator bool () const { return m_league_rank < m_league_end ; }

  // iterate: advance to the next league index and reset shared scratch.
  void next_team() { ++m_league_rank ; m_exec.shared_reset( m_team_shared ); }
};
|
||||
|
||||
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::Qthread , Properties ... >
|
||||
: public PolicyTraits< Properties... >
|
||||
{
|
||||
private:
|
||||
|
||||
const int m_league_size ;
|
||||
const int m_team_size ;
|
||||
const int m_shepherd_iter ;
|
||||
|
||||
public:
|
||||
|
||||
//! Tag this class as a kokkos execution policy
|
||||
typedef TeamPolicyInternal execution_policy ;
|
||||
typedef Qthread execution_space ;
|
||||
typedef PolicyTraits< Properties ... > traits ;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & )
|
||||
{ return Qthread::instance().shepherd_worker_size(); }
|
||||
|
||||
template< class FunctorType >
|
||||
static int team_size_recommended( const FunctorType & f )
|
||||
{ return team_size_max( f ); }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType & f , const int& )
|
||||
{ return team_size_max( f ); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline int team_size() const { return m_team_size ; }
|
||||
inline int league_size() const { return m_league_size ; }
|
||||
|
||||
// One active team per shepherd
|
||||
TeamPolicyInternal( Kokkos::Qthread & q
|
||||
, const int league_size
|
||||
, const int team_size
|
||||
, const int /* vector_length */ = 0
|
||||
)
|
||||
: m_league_size( league_size )
|
||||
, m_team_size( team_size < q.shepherd_worker_size()
|
||||
? team_size : q.shepherd_worker_size() )
|
||||
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
|
||||
{
|
||||
}
|
||||
|
||||
// One active team per shepherd
|
||||
TeamPolicyInternal( const int league_size
|
||||
, const int team_size
|
||||
, const int /* vector_length */ = 0
|
||||
)
|
||||
: m_league_size( league_size )
|
||||
, m_team_size( team_size < Qthread::instance().shepherd_worker_size()
|
||||
? team_size : Qthread::instance().shepherd_worker_size() )
|
||||
, m_shepherd_iter( ( league_size + Qthread::instance().shepherd_size() - 1 ) / Qthread::instance().shepherd_size() )
|
||||
{
|
||||
}
|
||||
|
||||
typedef Impl::QthreadTeamPolicyMember member_type ;
|
||||
|
||||
friend class Impl::QthreadTeamPolicyMember ;
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #define KOKKOS_QTHREADEXEC_HPP */
|
||||
|
||||
519
lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
Normal file
519
lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.cpp
Normal file
@ -0,0 +1,519 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include <iostream>
|
||||
#include <sstream>
|
||||
#include <utility>
|
||||
|
||||
#include <Kokkos_Qthreads.hpp>
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
|
||||
// Defines to enable experimental Qthreads functionality.
|
||||
//#define QTHREAD_LOCAL_PRIORITY
|
||||
//#define CLONED_TASKS
|
||||
|
||||
//#include <qthread.h>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
namespace {
|
||||
|
||||
enum { MAXIMUM_QTHREADS_WORKERS = 1024 };
|
||||
|
||||
/** s_exec is indexed by the reverse rank of the workers
|
||||
* for faster fan-in / fan-out lookups
|
||||
* [ n - 1, n - 2, ..., 0 ]
|
||||
*/
|
||||
QthreadsExec * s_exec[ MAXIMUM_QTHREADS_WORKERS ];
|
||||
|
||||
int s_number_shepherds = 0;
|
||||
int s_number_workers_per_shepherd = 0;
|
||||
int s_number_workers = 0;
|
||||
|
||||
inline
|
||||
QthreadsExec ** worker_exec()
|
||||
{
|
||||
return s_exec + s_number_workers - ( qthread_shep() * s_number_workers_per_shepherd + qthread_worker_local( NULL ) + 1 );
|
||||
}
|
||||
|
||||
const int s_base_size = QthreadsExec::align_alloc( sizeof(QthreadsExec) );
|
||||
|
||||
int s_worker_reduce_end = 0; // End of worker reduction memory.
|
||||
int s_worker_shared_end = 0; // Total of worker scratch memory.
|
||||
int s_worker_shared_begin = 0; // Beginning of worker shared memory.
|
||||
|
||||
QthreadsExecFunctionPointer volatile s_active_function = 0;
|
||||
const void * volatile s_active_function_arg = 0;
|
||||
|
||||
} // namespace
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
int Qthreads::is_initialized()
|
||||
{
|
||||
return Impl::s_number_workers != 0;
|
||||
}
|
||||
|
||||
int Qthreads::concurrency()
|
||||
{
|
||||
return Impl::s_number_workers_per_shepherd;
|
||||
}
|
||||
|
||||
int Qthreads::in_parallel()
|
||||
{
|
||||
return Impl::s_active_function != 0;
|
||||
}
|
||||
|
||||
void Qthreads::initialize( int thread_count )
|
||||
{
|
||||
// Environment variable: QTHREAD_NUM_SHEPHERDS
|
||||
// Environment variable: QTHREAD_NUM_WORKERS_PER_SHEP
|
||||
// Environment variable: QTHREAD_HWPAR
|
||||
|
||||
{
|
||||
char buffer[256];
|
||||
snprintf( buffer, sizeof(buffer), "QTHREAD_HWPAR=%d", thread_count );
|
||||
putenv( buffer );
|
||||
}
|
||||
|
||||
const bool ok_init = ( QTHREAD_SUCCESS == qthread_initialize() ) &&
|
||||
( thread_count == qthread_num_shepherds() * qthread_num_workers_local( NO_SHEPHERD ) ) &&
|
||||
( thread_count == qthread_num_workers() );
|
||||
|
||||
bool ok_symmetry = true;
|
||||
|
||||
if ( ok_init ) {
|
||||
Impl::s_number_shepherds = qthread_num_shepherds();
|
||||
Impl::s_number_workers_per_shepherd = qthread_num_workers_local( NO_SHEPHERD );
|
||||
Impl::s_number_workers = Impl::s_number_shepherds * Impl::s_number_workers_per_shepherd;
|
||||
|
||||
for ( int i = 0; ok_symmetry && i < Impl::s_number_shepherds; ++i ) {
|
||||
ok_symmetry = ( Impl::s_number_workers_per_shepherd == qthread_num_workers_local( i ) );
|
||||
}
|
||||
}
|
||||
|
||||
if ( ! ok_init || ! ok_symmetry ) {
|
||||
std::ostringstream msg;
|
||||
|
||||
msg << "Kokkos::Qthreads::initialize(" << thread_count << ") FAILED";
|
||||
msg << " : qthread_num_shepherds = " << qthread_num_shepherds();
|
||||
msg << " : qthread_num_workers_per_shepherd = " << qthread_num_workers_local( NO_SHEPHERD );
|
||||
msg << " : qthread_num_workers = " << qthread_num_workers();
|
||||
|
||||
if ( ! ok_symmetry ) {
|
||||
msg << " : qthread_num_workers_local = {";
|
||||
for ( int i = 0; i < Impl::s_number_shepherds; ++i ) {
|
||||
msg << " " << qthread_num_workers_local( i );
|
||||
}
|
||||
msg << " }";
|
||||
}
|
||||
|
||||
Impl::s_number_workers = 0;
|
||||
Impl::s_number_shepherds = 0;
|
||||
Impl::s_number_workers_per_shepherd = 0;
|
||||
|
||||
if ( ok_init ) { qthread_finalize(); }
|
||||
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
|
||||
Impl::QthreadsExec::resize_worker_scratch( 256, 256 );
|
||||
|
||||
// Init the array for used for arbitrarily sized atomics.
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
}
|
||||
|
||||
void Qthreads::finalize()
|
||||
{
|
||||
Impl::QthreadsExec::clear_workers();
|
||||
|
||||
if ( Impl::s_number_workers ) {
|
||||
qthread_finalize();
|
||||
}
|
||||
|
||||
Impl::s_number_workers = 0;
|
||||
Impl::s_number_shepherds = 0;
|
||||
Impl::s_number_workers_per_shepherd = 0;
|
||||
}
|
||||
|
||||
void Qthreads::print_configuration( std::ostream & s, const bool detail )
|
||||
{
|
||||
s << "Kokkos::Qthreads {"
|
||||
<< " num_shepherds(" << Impl::s_number_shepherds << ")"
|
||||
<< " num_workers_per_shepherd(" << Impl::s_number_workers_per_shepherd << ")"
|
||||
<< " }" << std::endl;
|
||||
}
|
||||
|
||||
Qthreads & Qthreads::instance( int )
|
||||
{
|
||||
static Qthreads q;
|
||||
return q;
|
||||
}
|
||||
|
||||
void Qthreads::fence()
|
||||
{
|
||||
}
|
||||
|
||||
int Qthreads::shepherd_size() const { return Impl::s_number_shepherds; }
|
||||
int Qthreads::shepherd_worker_size() const { return Impl::s_number_workers_per_shepherd; }
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
namespace {
|
||||
|
||||
aligned_t driver_exec_all( void * arg )
|
||||
{
|
||||
QthreadsExec & exec = **worker_exec();
|
||||
|
||||
(*s_active_function)( exec, s_active_function_arg );
|
||||
|
||||
/*
|
||||
fprintf( stdout
|
||||
, "QthreadsExec driver worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
|
||||
, exec.worker_rank()
|
||||
, exec.worker_size()
|
||||
, exec.shepherd_rank()
|
||||
, exec.shepherd_size()
|
||||
, exec.shepherd_worker_rank()
|
||||
, exec.shepherd_worker_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
*/
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
aligned_t driver_resize_worker_scratch( void * arg )
|
||||
{
|
||||
static volatile int lock_begin = 0;
|
||||
static volatile int lock_end = 0;
|
||||
|
||||
QthreadsExec ** const exec = worker_exec();
|
||||
|
||||
//----------------------------------------
|
||||
// Serialize allocation for thread safety.
|
||||
|
||||
while ( ! atomic_compare_exchange_strong( & lock_begin, 0, 1 ) ); // Spin wait to claim lock.
|
||||
|
||||
const bool ok = 0 == *exec;
|
||||
|
||||
if ( ok ) { *exec = (QthreadsExec *) malloc( s_base_size + s_worker_shared_end ); }
|
||||
|
||||
lock_begin = 0; // Release lock.
|
||||
|
||||
if ( ok ) { new( *exec ) QthreadsExec(); }
|
||||
|
||||
//----------------------------------------
|
||||
// Wait for all calls to complete to insure that each worker has executed.
|
||||
|
||||
if ( s_number_workers == 1 + atomic_fetch_add( & lock_end, 1 ) ) { lock_end = 0; }
|
||||
|
||||
while ( lock_end );
|
||||
|
||||
/*
|
||||
fprintf( stdout
|
||||
, "QthreadsExec resize worker(%d:%d) shepherd(%d:%d) shepherd_worker(%d:%d) done\n"
|
||||
, (**exec).worker_rank()
|
||||
, (**exec).worker_size()
|
||||
, (**exec).shepherd_rank()
|
||||
, (**exec).shepherd_size()
|
||||
, (**exec).shepherd_worker_rank()
|
||||
, (**exec).shepherd_worker_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
*/
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
if ( ! ok ) {
|
||||
fprintf( stderr, "Kokkos::QthreadsExec resize failed\n" );
|
||||
fflush( stderr );
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
void verify_is_process( const char * const label, bool not_active = false )
|
||||
{
|
||||
const bool not_process = 0 != qthread_shep() || 0 != qthread_worker_local( NULL );
|
||||
const bool is_active = not_active && ( s_active_function || s_active_function_arg );
|
||||
|
||||
if ( not_process || is_active ) {
|
||||
std::string msg( label );
|
||||
msg.append( " : FAILED" );
|
||||
if ( not_process ) msg.append(" : not called by main process");
|
||||
if ( is_active ) msg.append(" : parallel execution in progress");
|
||||
Kokkos::Impl::throw_runtime_exception( msg );
|
||||
}
|
||||
}
|
||||
|
||||
} // namespace
|
||||
|
||||
int QthreadsExec::worker_per_shepherd()
|
||||
{
|
||||
return s_number_workers_per_shepherd;
|
||||
}
|
||||
|
||||
QthreadsExec::QthreadsExec()
|
||||
{
|
||||
const int shepherd_rank = qthread_shep();
|
||||
const int shepherd_worker_rank = qthread_worker_local( NULL );
|
||||
const int worker_rank = shepherd_rank * s_number_workers_per_shepherd + shepherd_worker_rank;
|
||||
|
||||
m_worker_base = s_exec;
|
||||
m_shepherd_base = s_exec + s_number_workers_per_shepherd * ( ( s_number_shepherds - ( shepherd_rank + 1 ) ) );
|
||||
m_scratch_alloc = ( (unsigned char *) this ) + s_base_size;
|
||||
m_reduce_end = s_worker_reduce_end;
|
||||
m_shepherd_rank = shepherd_rank;
|
||||
m_shepherd_size = s_number_shepherds;
|
||||
m_shepherd_worker_rank = shepherd_worker_rank;
|
||||
m_shepherd_worker_size = s_number_workers_per_shepherd;
|
||||
m_worker_rank = worker_rank;
|
||||
m_worker_size = s_number_workers;
|
||||
m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
|
||||
void QthreadsExec::clear_workers()
|
||||
{
|
||||
for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
|
||||
QthreadsExec * const exec = s_exec[iwork];
|
||||
s_exec[iwork] = 0;
|
||||
free( exec );
|
||||
}
|
||||
}
|
||||
|
||||
void QthreadsExec::shared_reset( Qthreads::scratch_memory_space & space )
|
||||
{
|
||||
new( & space )
|
||||
Qthreads::scratch_memory_space(
|
||||
((unsigned char *) (**m_shepherd_base).m_scratch_alloc ) + s_worker_shared_begin,
|
||||
s_worker_shared_end - s_worker_shared_begin
|
||||
);
|
||||
}
|
||||
|
||||
void QthreadsExec::resize_worker_scratch( const int reduce_size, const int shared_size )
|
||||
{
|
||||
const int exec_all_reduce_alloc = align_alloc( reduce_size );
|
||||
const int shepherd_scan_alloc = align_alloc( 8 );
|
||||
const int shepherd_shared_end = exec_all_reduce_alloc + shepherd_scan_alloc + align_alloc( shared_size );
|
||||
|
||||
if ( s_worker_reduce_end < exec_all_reduce_alloc ||
|
||||
s_worker_shared_end < shepherd_shared_end ) {
|
||||
|
||||
/*
|
||||
fprintf( stdout, "QthreadsExec::resize\n");
|
||||
fflush(stdout);
|
||||
*/
|
||||
|
||||
// Clear current worker memory before allocating new worker memory.
|
||||
clear_workers();
|
||||
|
||||
// Increase the buffers to an aligned allocation.
|
||||
s_worker_reduce_end = exec_all_reduce_alloc;
|
||||
s_worker_shared_begin = exec_all_reduce_alloc + shepherd_scan_alloc;
|
||||
s_worker_shared_end = shepherd_shared_end;
|
||||
|
||||
// Need to query which shepherd this main 'process' is running.
|
||||
|
||||
const int main_shep = qthread_shep();
|
||||
|
||||
// Have each worker resize its memory for proper first-touch.
|
||||
#if 0
|
||||
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
|
||||
for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i ) {
|
||||
qthread_fork_to( driver_resize_worker_scratch, NULL, NULL, jshep );
|
||||
}
|
||||
}
|
||||
#else
|
||||
// If this function is used before the 'qthreads.task_policy' unit test,
|
||||
// the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
|
||||
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
|
||||
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
|
||||
|
||||
if ( num_clone ) {
|
||||
const int ret = qthread_fork_clones_to_local_priority
|
||||
( driver_resize_worker_scratch // Function
|
||||
, NULL // Function data block
|
||||
, NULL // Pointer to return value feb
|
||||
, jshep // Shepherd number
|
||||
, num_clone - 1 // Number of instances - 1
|
||||
);
|
||||
|
||||
assert( ret == QTHREAD_SUCCESS );
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
driver_resize_worker_scratch( NULL );
|
||||
|
||||
// Verify all workers allocated.
|
||||
|
||||
bool ok = true;
|
||||
for ( int iwork = 0; ok && iwork < s_number_workers; ++iwork ) { ok = 0 != s_exec[iwork]; }
|
||||
|
||||
if ( ! ok ) {
|
||||
std::ostringstream msg;
|
||||
msg << "Kokkos::Impl::QthreadsExec::resize : FAILED for workers {";
|
||||
for ( int iwork = 0; iwork < s_number_workers; ++iwork ) {
|
||||
if ( 0 == s_exec[iwork] ) { msg << " " << ( s_number_workers - ( iwork + 1 ) ); }
|
||||
}
|
||||
msg << " }";
|
||||
Kokkos::Impl::throw_runtime_exception( msg.str() );
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void QthreadsExec::exec_all( Qthreads &, QthreadsExecFunctionPointer func, const void * arg )
|
||||
{
|
||||
verify_is_process("QthreadsExec::exec_all(...)",true);
|
||||
|
||||
/*
|
||||
fprintf( stdout, "QthreadsExec::exec_all\n");
|
||||
fflush(stdout);
|
||||
*/
|
||||
|
||||
s_active_function = func;
|
||||
s_active_function_arg = arg;
|
||||
|
||||
// Need to query which shepherd this main 'process' is running.
|
||||
|
||||
const int main_shep = qthread_shep();
|
||||
|
||||
#if 0
|
||||
for ( int jshep = 0, iwork = 0; jshep < s_number_shepherds; ++jshep ) {
|
||||
for ( int i = jshep != main_shep ? 0 : 1; i < s_number_workers_per_shepherd; ++i, ++iwork ) {
|
||||
qthread_fork_to( driver_exec_all, NULL, NULL, jshep );
|
||||
}
|
||||
}
|
||||
#else
|
||||
// If this function is used before the 'qthreads.task_policy' unit test,
|
||||
// the 'qthreads.task_policy' unit test fails with a seg-fault within libqthread.so.
|
||||
for ( int jshep = 0; jshep < s_number_shepherds; ++jshep ) {
|
||||
const int num_clone = jshep != main_shep ? s_number_workers_per_shepherd : s_number_workers_per_shepherd - 1;
|
||||
|
||||
if ( num_clone ) {
|
||||
const int ret = qthread_fork_clones_to_local_priority
|
||||
( driver_exec_all // Function
|
||||
, NULL // Function data block
|
||||
, NULL // Pointer to return value feb
|
||||
, jshep // Shepherd number
|
||||
, num_clone - 1 // Number of instances - 1
|
||||
);
|
||||
|
||||
assert(ret == QTHREAD_SUCCESS);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
driver_exec_all( NULL );
|
||||
|
||||
s_active_function = 0;
|
||||
s_active_function_arg = 0;
|
||||
}
|
||||
|
||||
void * QthreadsExec::exec_all_reduce_result()
|
||||
{
|
||||
return s_exec[0]->m_scratch_alloc;
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
QthreadsTeamPolicyMember::QthreadsTeamPolicyMember()
|
||||
: m_exec( **worker_exec() )
|
||||
, m_team_shared( 0, 0 )
|
||||
, m_team_size( 1 )
|
||||
, m_team_rank( 0 )
|
||||
, m_league_size( 1 )
|
||||
, m_league_end( 1 )
|
||||
, m_league_rank( 0 )
|
||||
{
|
||||
m_exec.shared_reset( m_team_shared );
|
||||
}
|
||||
|
||||
QthreadsTeamPolicyMember::QthreadsTeamPolicyMember( const QthreadsTeamPolicyMember::TaskTeam & )
|
||||
: m_exec( **worker_exec() )
|
||||
, m_team_shared( 0, 0 )
|
||||
, m_team_size( s_number_workers_per_shepherd )
|
||||
, m_team_rank( m_exec.shepherd_worker_rank() )
|
||||
, m_league_size( 1 )
|
||||
, m_league_end( 1 )
|
||||
, m_league_rank( 0 )
|
||||
{
|
||||
m_exec.shared_reset( m_team_shared );
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
640
lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
Normal file
640
lib/kokkos/core/src/Qthreads/Kokkos_QthreadsExec.hpp
Normal file
@ -0,0 +1,640 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_QTHREADSEXEC_HPP
|
||||
#define KOKKOS_QTHREADSEXEC_HPP
|
||||
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
class QthreadsExec;
|
||||
|
||||
typedef void (*QthreadsExecFunctionPointer)( QthreadsExec &, const void * );
|
||||
|
||||
class QthreadsExec {
|
||||
private:
|
||||
enum { Inactive = 0, Active = 1 };
|
||||
|
||||
const QthreadsExec * const * m_worker_base;
|
||||
const QthreadsExec * const * m_shepherd_base;
|
||||
|
||||
void * m_scratch_alloc; ///< Scratch memory [ reduce, team, shared ]
|
||||
int m_reduce_end; ///< End of scratch reduction memory
|
||||
|
||||
int m_shepherd_rank;
|
||||
int m_shepherd_size;
|
||||
|
||||
int m_shepherd_worker_rank;
|
||||
int m_shepherd_worker_size;
|
||||
|
||||
/*
|
||||
* m_worker_rank = m_shepherd_rank * m_shepherd_worker_size + m_shepherd_worker_rank
|
||||
* m_worker_size = m_shepherd_size * m_shepherd_worker_size
|
||||
*/
|
||||
int m_worker_rank;
|
||||
int m_worker_size;
|
||||
|
||||
int mutable volatile m_worker_state;
|
||||
|
||||
friend class Kokkos::Qthreads;
|
||||
|
||||
~QthreadsExec();
|
||||
QthreadsExec( const QthreadsExec & );
|
||||
QthreadsExec & operator = ( const QthreadsExec & );
|
||||
|
||||
public:
|
||||
QthreadsExec();
|
||||
|
||||
/** Execute the input function on all available Qthreads workers. */
|
||||
static void exec_all( Qthreads &, QthreadsExecFunctionPointer, const void * );
|
||||
|
||||
/** Barrier across all workers participating in the 'exec_all'. */
|
||||
void exec_all_barrier() const
|
||||
{
|
||||
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
|
||||
|
||||
int n, j;
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
|
||||
Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadsExec::Inactive;
|
||||
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
|
||||
}
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
}
|
||||
|
||||
/** Barrier across workers within the shepherd with rank < team_rank. */
|
||||
void shepherd_barrier( const int team_size ) const
|
||||
{
|
||||
if ( m_shepherd_worker_rank < team_size ) {
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n, j;
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadsExec::Inactive;
|
||||
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
|
||||
}
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/** Reduce across all workers participating in the 'exec_all'. */
|
||||
template< class FunctorType, class ReducerType, class ArgTag >
|
||||
inline
|
||||
void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
|
||||
{
|
||||
typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
|
||||
typedef typename ReducerConditional::type ReducerTypeFwd;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin;
|
||||
|
||||
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
|
||||
|
||||
int n, j;
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
|
||||
const QthreadsExec & fan = *m_worker_base[j];
|
||||
|
||||
Impl::spinwait_while_equal( fan.m_worker_state, QthreadsExec::Active );
|
||||
|
||||
ValueJoin::join( ReducerConditional::select( func, reduce ), m_scratch_alloc, fan.m_scratch_alloc );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadsExec::Inactive;
|
||||
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
|
||||
}
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
}
|
||||
|
||||
/** Scan across all workers participating in the 'exec_all'. */
|
||||
template< class FunctorType, class ArgTag >
|
||||
inline
|
||||
void exec_all_scan( const FunctorType & func ) const
|
||||
{
|
||||
typedef Kokkos::Impl::FunctorValueInit< FunctorType, ArgTag > ValueInit;
|
||||
typedef Kokkos::Impl::FunctorValueJoin< FunctorType, ArgTag > ValueJoin;
|
||||
typedef Kokkos::Impl::FunctorValueOps< FunctorType, ArgTag > ValueOps;
|
||||
|
||||
const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
|
||||
|
||||
int n, j;
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
|
||||
Impl::spinwait_while_equal( m_worker_base[j]->m_worker_state, QthreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadsExec::Inactive;
|
||||
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
|
||||
}
|
||||
else {
|
||||
// Root thread scans across values before releasing threads.
|
||||
// Worker data is in reverse order, so m_worker_base[0] is the
|
||||
// highest ranking thread.
|
||||
|
||||
// Copy from lower ranking to higher ranking worker.
|
||||
for ( int i = 1; i < m_worker_size; ++i ) {
|
||||
ValueOps::copy( func
|
||||
, m_worker_base[i-1]->m_scratch_alloc
|
||||
, m_worker_base[i]->m_scratch_alloc
|
||||
);
|
||||
}
|
||||
|
||||
ValueInit::init( func, m_worker_base[m_worker_size-1]->m_scratch_alloc );
|
||||
|
||||
// Join from lower ranking to higher ranking worker.
|
||||
// Value at m_worker_base[n-1] is zero so skip adding it to m_worker_base[n-2].
|
||||
for ( int i = m_worker_size - 1; --i > 0; ) {
|
||||
ValueJoin::join( func, m_worker_base[i-1]->m_scratch_alloc, m_worker_base[i]->m_scratch_alloc );
|
||||
}
|
||||
}
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < m_worker_size ); n <<= 1 ) {
|
||||
m_worker_base[j]->m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class Type >
|
||||
inline
|
||||
volatile Type * shepherd_team_scratch_value() const
|
||||
{ return (volatile Type*)( ( (unsigned char *) m_scratch_alloc ) + m_reduce_end ); }
|
||||
|
||||
template< class Type >
|
||||
inline
|
||||
void shepherd_broadcast( Type & value, const int team_size, const int team_rank ) const
|
||||
{
|
||||
if ( m_shepherd_base ) {
|
||||
Type * const shared_value = m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
if ( m_shepherd_worker_rank == team_rank ) { *shared_value = value; }
|
||||
memory_fence();
|
||||
shepherd_barrier( team_size );
|
||||
value = *shared_value;
|
||||
}
|
||||
}
|
||||
|
||||
template< class Type >
|
||||
inline
|
||||
Type shepherd_reduce( const int team_size, const Type & value ) const
|
||||
{
|
||||
volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
|
||||
*shared_value = value;
|
||||
// *shepherd_team_scratch_value<Type>() = value;
|
||||
|
||||
memory_fence();
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n, j;
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadsExec::Inactive;
|
||||
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
|
||||
}
|
||||
else {
|
||||
Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
for ( int i = 1; i < n; ++i ) {
|
||||
accum += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
for ( int i = 1; i < n; ++i ) {
|
||||
*m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
|
||||
return *shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
template< class JoinOp >
|
||||
inline
|
||||
typename JoinOp::value_type
|
||||
shepherd_reduce( const int team_size
|
||||
, const typename JoinOp::value_type & value
|
||||
, const JoinOp & op ) const
|
||||
{
|
||||
typedef typename JoinOp::value_type Type;
|
||||
|
||||
volatile Type * const shared_value = shepherd_team_scratch_value<Type>();
|
||||
*shared_value = value;
|
||||
// *shepherd_team_scratch_value<Type>() = value;
|
||||
|
||||
memory_fence();
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n, j;
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadsExec::Inactive;
|
||||
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
|
||||
}
|
||||
else {
|
||||
volatile Type & accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
for ( int i = 1; i < team_size; ++i ) {
|
||||
op.join( accum, *m_shepherd_base[i]->shepherd_team_scratch_value<Type>() );
|
||||
}
|
||||
for ( int i = 1; i < team_size; ++i ) {
|
||||
*m_shepherd_base[i]->shepherd_team_scratch_value<Type>() = accum;
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
|
||||
return *shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
template< class Type >
|
||||
inline
|
||||
Type shepherd_scan( const int team_size
|
||||
, const Type & value
|
||||
, Type * const global_value = 0 ) const
|
||||
{
|
||||
*shepherd_team_scratch_value<Type>() = value;
|
||||
|
||||
memory_fence();
|
||||
|
||||
const int rev_rank = team_size - ( m_shepherd_worker_rank + 1 );
|
||||
|
||||
int n, j;
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
Impl::spinwait_while_equal( m_shepherd_base[j]->m_worker_state, QthreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_worker_state = QthreadsExec::Inactive;
|
||||
Impl::spinwait_while_equal( m_worker_state, QthreadsExec::Inactive );
|
||||
}
|
||||
else {
|
||||
// Root thread scans across values before releasing threads.
|
||||
// Worker data is in reverse order, so m_shepherd_base[0] is the
|
||||
// highest ranking thread.
|
||||
|
||||
// Copy from lower ranking to higher ranking worker.
|
||||
|
||||
Type accum = *m_shepherd_base[0]->shepherd_team_scratch_value<Type>();
|
||||
for ( int i = 1; i < team_size; ++i ) {
|
||||
const Type tmp = *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
|
||||
accum += tmp;
|
||||
*m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() = tmp;
|
||||
}
|
||||
|
||||
*m_shepherd_base[team_size-1]->shepherd_team_scratch_value<Type>() =
|
||||
global_value ? atomic_fetch_add( global_value, accum ) : 0;
|
||||
|
||||
// Join from lower ranking to higher ranking worker.
|
||||
for ( int i = team_size; --i; ) {
|
||||
*m_shepherd_base[i-1]->shepherd_team_scratch_value<Type>() += *m_shepherd_base[i]->shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
memory_fence();
|
||||
}
|
||||
|
||||
for ( n = 1; ( ! ( rev_rank & n ) ) && ( ( j = rev_rank + n ) < team_size ); n <<= 1 ) {
|
||||
m_shepherd_base[j]->m_worker_state = QthreadsExec::Active;
|
||||
}
|
||||
|
||||
return *shepherd_team_scratch_value<Type>();
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
static inline
|
||||
int align_alloc( int size )
|
||||
{
|
||||
enum { ALLOC_GRAIN = 1 << 6 /* power of two, 64bytes */ };
|
||||
enum { ALLOC_GRAIN_MASK = ALLOC_GRAIN - 1 };
|
||||
return ( size + ALLOC_GRAIN_MASK ) & ~ALLOC_GRAIN_MASK;
|
||||
}
|
||||
|
||||
void shared_reset( Qthreads::scratch_memory_space & );
|
||||
|
||||
void * exec_all_reduce_value() const { return m_scratch_alloc; }
|
||||
|
||||
static void * exec_all_reduce_result();
|
||||
|
||||
static void resize_worker_scratch( const int reduce_size, const int shared_size );
|
||||
static void clear_workers();
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline int worker_rank() const { return m_worker_rank; }
|
||||
inline int worker_size() const { return m_worker_size; }
|
||||
inline int shepherd_worker_rank() const { return m_shepherd_worker_rank; }
|
||||
inline int shepherd_worker_size() const { return m_shepherd_worker_size; }
|
||||
inline int shepherd_rank() const { return m_shepherd_rank; }
|
||||
inline int shepherd_size() const { return m_shepherd_size; }
|
||||
|
||||
static int worker_per_shepherd();
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
namespace Impl {
|
||||
|
||||
class QthreadsTeamPolicyMember {
|
||||
private:
|
||||
typedef Kokkos::Qthreads execution_space;
|
||||
typedef execution_space::scratch_memory_space scratch_memory_space;
|
||||
|
||||
Impl::QthreadsExec & m_exec;
|
||||
scratch_memory_space m_team_shared;
|
||||
const int m_team_size;
|
||||
const int m_team_rank;
|
||||
const int m_league_size;
|
||||
const int m_league_end;
|
||||
int m_league_rank;
|
||||
|
||||
public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
const scratch_memory_space & team_shmem() const { return m_team_shared; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank; }
|
||||
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size; }
|
||||
KOKKOS_INLINE_FUNCTION int team_rank() const { return m_team_rank; }
|
||||
KOKKOS_INLINE_FUNCTION int team_size() const { return m_team_size; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{}
|
||||
#else
|
||||
{ m_exec.shepherd_barrier( m_team_size ); }
|
||||
#endif
|
||||
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_broadcast( const Type & value, int rank ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return Type(); }
|
||||
#else
|
||||
{ return m_exec.template shepherd_broadcast<Type>( value, m_team_size, rank ); }
|
||||
#endif
|
||||
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_reduce( const Type & value ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return Type(); }
|
||||
#else
|
||||
{ return m_exec.template shepherd_reduce<Type>( m_team_size, value ); }
|
||||
#endif
|
||||
|
||||
template< typename JoinOp >
|
||||
KOKKOS_INLINE_FUNCTION typename JoinOp::value_type
|
||||
team_reduce( const typename JoinOp::value_type & value
|
||||
, const JoinOp & op ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return typename JoinOp::value_type(); }
|
||||
#else
|
||||
{ return m_exec.template shepherd_reduce<JoinOp>( m_team_size, value, op ); }
|
||||
#endif
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering.
|
||||
*
|
||||
* The highest rank thread can compute the reduction total as
|
||||
* reduction_total = dev.team_scan( value ) + value;
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return Type(); }
|
||||
#else
|
||||
{ return m_exec.template shepherd_scan<Type>( m_team_size, value ); }
|
||||
#endif
|
||||
|
||||
/** \brief Intra-team exclusive prefix sum with team_rank() ordering
|
||||
* with intra-team non-deterministic ordering accumulation.
|
||||
*
|
||||
* The global inter-team accumulation value will, at the end of the league's
|
||||
* parallel execution, be the scan's total. Parallel execution ordering of
|
||||
* the league's teams is non-deterministic. As such the base value for each
|
||||
* team's scan operation is similarly non-deterministic.
|
||||
*/
|
||||
template< typename Type >
|
||||
KOKKOS_INLINE_FUNCTION Type team_scan( const Type & value, Type * const global_accum ) const
|
||||
#if ! defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
{ return Type(); }
|
||||
#else
|
||||
{ return m_exec.template shepherd_scan<Type>( m_team_size, value, global_accum ); }
|
||||
#endif
|
||||
|
||||
//----------------------------------------
|
||||
// Private driver for task-team parallel.
|
||||
|
||||
struct TaskTeam {};
|
||||
|
||||
QthreadsTeamPolicyMember();
|
||||
explicit QthreadsTeamPolicyMember( const TaskTeam & );
|
||||
|
||||
//----------------------------------------
|
||||
// Private for the driver ( for ( member_type i( exec, team ); i; i.next_team() ) { ... }
|
||||
|
||||
// Initialize.
|
||||
template< class ... Properties >
|
||||
QthreadsTeamPolicyMember( Impl::QthreadsExec & exec
|
||||
, const Kokkos::Impl::TeamPolicyInternal< Qthreads, Properties... > & team )
|
||||
: m_exec( exec )
|
||||
, m_team_shared( 0, 0 )
|
||||
, m_team_size( team.m_team_size )
|
||||
, m_team_rank( exec.shepherd_worker_rank() )
|
||||
, m_league_size( team.m_league_size )
|
||||
, m_league_end( team.m_league_size - team.m_shepherd_iter * ( exec.shepherd_size() - ( exec.shepherd_rank() + 1 ) ) )
|
||||
, m_league_rank( m_league_end > team.m_shepherd_iter ? m_league_end - team.m_shepherd_iter : 0 )
|
||||
{
|
||||
m_exec.shared_reset( m_team_shared );
|
||||
}
|
||||
|
||||
// Continue.
|
||||
operator bool () const { return m_league_rank < m_league_end; }
|
||||
|
||||
// Iterate.
|
||||
void next_team() { ++m_league_rank; m_exec.shared_reset( m_team_shared ); }
|
||||
};
|
||||
|
||||
template< class ... Properties >
|
||||
class TeamPolicyInternal< Kokkos::Qthreads, Properties ... >
|
||||
: public PolicyTraits< Properties... >
|
||||
{
|
||||
private:
|
||||
const int m_league_size;
|
||||
const int m_team_size;
|
||||
const int m_shepherd_iter;
|
||||
|
||||
public:
|
||||
//! Tag this class as a kokkos execution policy.
|
||||
typedef TeamPolicyInternal execution_policy;
|
||||
typedef Qthreads execution_space;
|
||||
typedef PolicyTraits< Properties ... > traits;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & )
|
||||
{ return Qthreads::instance().shepherd_worker_size(); }
|
||||
|
||||
template< class FunctorType >
|
||||
static int team_size_recommended( const FunctorType & f )
|
||||
{ return team_size_max( f ); }
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_recommended( const FunctorType & f, const int& )
|
||||
{ return team_size_max( f ); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
inline int team_size() const { return m_team_size; }
|
||||
inline int league_size() const { return m_league_size; }
|
||||
|
||||
// One active team per shepherd.
|
||||
TeamPolicyInternal( Kokkos::Qthreads & q
|
||||
, const int league_size
|
||||
, const int team_size
|
||||
, const int /* vector_length */ = 0
|
||||
)
|
||||
: m_league_size( league_size )
|
||||
, m_team_size( team_size < q.shepherd_worker_size()
|
||||
? team_size : q.shepherd_worker_size() )
|
||||
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
|
||||
{}
|
||||
|
||||
// TODO: Make sure this is correct.
|
||||
// One active team per shepherd.
|
||||
TeamPolicyInternal( Kokkos::Qthreads & q
|
||||
, const int league_size
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, const int /* vector_length */ = 0
|
||||
)
|
||||
: m_league_size( league_size )
|
||||
, m_team_size( q.shepherd_worker_size() )
|
||||
, m_shepherd_iter( ( league_size + q.shepherd_size() - 1 ) / q.shepherd_size() )
|
||||
{}
|
||||
|
||||
// One active team per shepherd.
|
||||
TeamPolicyInternal( const int league_size
|
||||
, const int team_size
|
||||
, const int /* vector_length */ = 0
|
||||
)
|
||||
: m_league_size( league_size )
|
||||
, m_team_size( team_size < Qthreads::instance().shepherd_worker_size()
|
||||
? team_size : Qthreads::instance().shepherd_worker_size() )
|
||||
, m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
|
||||
{}
|
||||
|
||||
// TODO: Make sure this is correct.
|
||||
// One active team per shepherd.
|
||||
TeamPolicyInternal( const int league_size
|
||||
, const Kokkos::AUTO_t & /* team_size_request */
|
||||
, const int /* vector_length */ = 0
|
||||
)
|
||||
: m_league_size( league_size )
|
||||
, m_team_size( Qthreads::instance().shepherd_worker_size() )
|
||||
, m_shepherd_iter( ( league_size + Qthreads::instance().shepherd_size() - 1 ) / Qthreads::instance().shepherd_size() )
|
||||
{}
|
||||
|
||||
// TODO: Doesn't do anything yet. Fix this.
|
||||
/** \brief set chunk_size to a discrete value*/
|
||||
inline TeamPolicyInternal set_chunk_size(typename traits::index_type chunk_size_) const {
|
||||
TeamPolicyInternal p = *this;
|
||||
// p.m_chunk_size = chunk_size_;
|
||||
return p;
|
||||
}
|
||||
|
||||
typedef Impl::QthreadsTeamPolicyMember member_type;
|
||||
|
||||
friend class Impl::QthreadsTeamPolicyMember;
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif // #define KOKKOS_QTHREADSEXEC_HPP
|
||||
@ -41,8 +41,8 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_QTHREAD_PARALLEL_HPP
|
||||
#define KOKKOS_QTHREAD_PARALLEL_HPP
|
||||
#ifndef KOKKOS_QTHREADS_PARALLEL_HPP
|
||||
#define KOKKOS_QTHREADS_PARALLEL_HPP
|
||||
|
||||
#include <vector>
|
||||
|
||||
@ -51,7 +51,7 @@
|
||||
#include <impl/Kokkos_StaticAssert.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
|
||||
#include <Qthread/Kokkos_QthreadExec.hpp>
|
||||
#include <Qthreads/Kokkos_QthreadsExec.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -63,7 +63,7 @@ namespace Impl {
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelFor< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::Qthread
|
||||
, Kokkos::Qthreads
|
||||
>
|
||||
{
|
||||
private:
|
||||
@ -99,7 +99,7 @@ private:
|
||||
}
|
||||
|
||||
// Function is called once by every concurrent thread.
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
@ -116,7 +116,7 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
|
||||
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
|
||||
|
||||
}
|
||||
|
||||
@ -134,7 +134,7 @@ template< class FunctorType , class ReducerType , class ... Traits >
|
||||
class ParallelReduce< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, ReducerType
|
||||
, Kokkos::Qthread
|
||||
, Kokkos::Qthreads
|
||||
>
|
||||
{
|
||||
private:
|
||||
@ -186,7 +186,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
|
||||
@ -205,10 +205,10 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
|
||||
QthreadsExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
|
||||
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
|
||||
|
||||
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
|
||||
const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
|
||||
|
||||
@ -246,11 +246,11 @@ public:
|
||||
template< class FunctorType , class ... Properties >
|
||||
class ParallelFor< FunctorType
|
||||
, TeamPolicy< Properties ... >
|
||||
, Kokkos::Qthread >
|
||||
, Kokkos::Qthreads >
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
|
||||
@ -282,7 +282,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelFor & self = * ((const ParallelFor *) arg );
|
||||
|
||||
@ -297,10 +297,10 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch
|
||||
QthreadsExec::resize_worker_scratch
|
||||
( /* reduction memory */ 0
|
||||
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelFor::exec , this );
|
||||
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelFor::exec , this );
|
||||
}
|
||||
|
||||
ParallelFor( const FunctorType & arg_functor ,
|
||||
@ -316,12 +316,12 @@ template< class FunctorType , class ReducerType , class ... Properties >
|
||||
class ParallelReduce< FunctorType
|
||||
, TeamPolicy< Properties... >
|
||||
, ReducerType
|
||||
, Kokkos::Qthread
|
||||
, Kokkos::Qthreads
|
||||
>
|
||||
{
|
||||
private:
|
||||
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthread , Properties ... > Policy ;
|
||||
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Qthreads , Properties ... > Policy ;
|
||||
|
||||
typedef typename Policy::work_tag WorkTag ;
|
||||
typedef typename Policy::member_type Member ;
|
||||
@ -365,7 +365,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelReduce & self = * ((const ParallelReduce *) arg );
|
||||
|
||||
@ -383,13 +383,13 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch
|
||||
QthreadsExec::resize_worker_scratch
|
||||
( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
|
||||
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
|
||||
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
|
||||
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelReduce::exec , this );
|
||||
|
||||
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
|
||||
const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result();
|
||||
|
||||
Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
|
||||
|
||||
@ -429,7 +429,7 @@ public:
|
||||
template< class FunctorType , class ... Traits >
|
||||
class ParallelScan< FunctorType
|
||||
, Kokkos::RangePolicy< Traits ... >
|
||||
, Kokkos::Qthread
|
||||
, Kokkos::Qthreads
|
||||
>
|
||||
{
|
||||
private:
|
||||
@ -474,7 +474,7 @@ private:
|
||||
}
|
||||
}
|
||||
|
||||
static void exec( QthreadExec & exec , const void * arg )
|
||||
static void exec( QthreadsExec & exec , const void * arg )
|
||||
{
|
||||
const ParallelScan & self = * ((const ParallelScan *) arg );
|
||||
|
||||
@ -497,8 +497,8 @@ public:
|
||||
inline
|
||||
void execute() const
|
||||
{
|
||||
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelScan::exec , this );
|
||||
QthreadsExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 );
|
||||
Impl::QthreadsExec::exec_all( Qthreads::instance() , & ParallelScan::exec , this );
|
||||
}
|
||||
|
||||
ParallelScan( const FunctorType & arg_functor
|
||||
@ -521,37 +521,37 @@ namespace Kokkos {
|
||||
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >
|
||||
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType& count )
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >
|
||||
TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType& count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, count );
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, count );
|
||||
}
|
||||
|
||||
template< typename iType1, typename iType2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::QthreadTeamPolicyMember >
|
||||
TeamThreadRange( const Impl::QthreadTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
|
||||
Impl::QthreadsTeamPolicyMember >
|
||||
TeamThreadRange( const Impl::QthreadsTeamPolicyMember& thread, const iType1 & begin, const iType2 & end )
|
||||
{
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadTeamPolicyMember >( thread, iType(begin), iType(end) );
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::QthreadsTeamPolicyMember >( thread, iType(begin), iType(end) );
|
||||
}
|
||||
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >
|
||||
ThreadVectorRange(const Impl::QthreadTeamPolicyMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >(thread,count);
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >
|
||||
ThreadVectorRange(const Impl::QthreadsTeamPolicyMember& thread, const iType& count) {
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >(thread,count);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember> PerTeam(const Impl::QthreadTeamPolicyMember& thread) {
|
||||
return Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
|
||||
Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember> PerTeam(const Impl::QthreadsTeamPolicyMember& thread) {
|
||||
return Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::QthreadTeamPolicyMember& thread) {
|
||||
return Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>(thread);
|
||||
Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember> PerThread(const Impl::QthreadsTeamPolicyMember& thread) {
|
||||
return Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>(thread);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
@ -560,7 +560,7 @@ Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember> PerThread(const Impl::Qt
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
|
||||
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
@ -571,7 +571,7 @@ void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qthrea
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
|
||||
const Lambda & lambda, ValueType& result) {
|
||||
|
||||
result = ValueType();
|
||||
@ -595,7 +595,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember>& loop_boundaries,
|
||||
void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember>& loop_boundaries,
|
||||
const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
@ -615,7 +615,7 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::Qth
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
|
||||
void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
|
||||
loop_boundaries, const Lambda& lambda) {
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
@ -630,7 +630,7 @@ void parallel_for(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qthr
|
||||
* val is performed and put into result. This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
|
||||
loop_boundaries, const Lambda & lambda, ValueType& result) {
|
||||
result = ValueType();
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
@ -652,7 +652,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
|
||||
* '1 for *'). This functionality requires C++11 support.*/
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
|
||||
ValueType result = init_result;
|
||||
@ -679,7 +679,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Q
|
||||
* This functionality requires C++11 support.*/
|
||||
template< typename iType, class FunctorType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadTeamPolicyMember >&
|
||||
void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::QthreadsTeamPolicyMember >&
|
||||
loop_boundaries, const FunctorType & lambda) {
|
||||
|
||||
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
|
||||
@ -697,25 +697,25 @@ void parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::Qth
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
|
||||
void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
|
||||
lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda) {
|
||||
void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda) {
|
||||
if(single_struct.team_member.team_rank()==0) lambda();
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::VectorSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
void single(const Impl::VectorSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
lambda(val);
|
||||
}
|
||||
|
||||
template<class FunctorType, class ValueType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
void single(const Impl::ThreadSingleStruct<Impl::QthreadsTeamPolicyMember>& single_struct, const FunctorType& lambda, ValueType& val) {
|
||||
if(single_struct.team_member.team_rank()==0) {
|
||||
lambda(val);
|
||||
}
|
||||
@ -724,4 +724,4 @@ void single(const Impl::ThreadSingleStruct<Impl::QthreadTeamPolicyMember>& singl
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #define KOKKOS_QTHREAD_PARALLEL_HPP */
|
||||
#endif /* #define KOKKOS_QTHREADS_PARALLEL_HPP */
|
||||
320
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
Normal file
320
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.cpp
Normal file
@ -0,0 +1,320 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <Kokkos_Core.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
#include <impl/Kokkos_TaskQueue_impl.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template class TaskQueue< Kokkos::Qthreads > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
TaskExec< Kokkos::Qthreads >::TaskExec()
|
||||
: m_self_exec( 0 ),
|
||||
m_team_exec( 0 ),
|
||||
m_sync_mask( 0 ),
|
||||
m_sync_value( 0 ),
|
||||
m_sync_step( 0 ),
|
||||
m_group_rank( 0 ),
|
||||
m_team_rank( 0 ),
|
||||
m_team_size( 1 )
|
||||
{}
|
||||
|
||||
TaskExec< Kokkos::Qthreads >::
|
||||
TaskExec( Kokkos::Impl::QthreadsExec & arg_exec, int const arg_team_size )
|
||||
: m_self_exec( & arg_exec ),
|
||||
m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) ),
|
||||
m_sync_mask( 0 ),
|
||||
m_sync_value( 0 ),
|
||||
m_sync_step( 0 ),
|
||||
m_group_rank( arg_exec.pool_rank_rev() / arg_team_size ),
|
||||
m_team_rank( arg_exec.pool_rank_rev() % arg_team_size ),
|
||||
m_team_size( arg_team_size )
|
||||
{
|
||||
// This team spans
|
||||
// m_self_exec->pool_rev( team_size * group_rank )
|
||||
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
|
||||
|
||||
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
|
||||
|
||||
sync[0] = int64_t(0) ;
|
||||
sync[1] = int64_t(0) ;
|
||||
|
||||
for ( int i = 0 ; i < m_team_size ; ++i ) {
|
||||
m_sync_value |= int64_t(1) << (8*i);
|
||||
m_sync_mask |= int64_t(3) << (8*i);
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
|
||||
void TaskExec< Kokkos::Qthreads >::team_barrier() const
|
||||
{
|
||||
if ( 1 < m_team_size ) {
|
||||
|
||||
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
|
||||
Kokkos::abort("TaskQueue<Qthreads> scratch_reduce memory too small");
|
||||
}
|
||||
|
||||
// Use team shared memory to synchronize.
|
||||
// Alternate memory locations between barriers to avoid a sequence
|
||||
// of barriers overtaking one another.
|
||||
|
||||
int64_t volatile * const sync =
|
||||
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
|
||||
|
||||
// This team member sets one byte within the sync variable
|
||||
int8_t volatile * const sync_self =
|
||||
((int8_t *) sync) + m_team_rank ;
|
||||
|
||||
#if 0
|
||||
fprintf( stdout,
|
||||
"barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n",
|
||||
m_group_rank,
|
||||
m_team_rank,
|
||||
m_sync_step,
|
||||
m_sync_value,
|
||||
*sync
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
|
||||
|
||||
while ( m_sync_value != *sync ); // wait for team to arrive
|
||||
|
||||
#if 0
|
||||
fprintf( stdout,
|
||||
"barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n",
|
||||
m_group_rank,
|
||||
m_team_rank,
|
||||
m_sync_step,
|
||||
m_sync_value,
|
||||
*sync
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
++m_sync_step ;
|
||||
|
||||
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
|
||||
m_sync_value ^= m_sync_mask ;
|
||||
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::Qthreads >::execute
|
||||
( TaskQueue< Kokkos::Qthreads > * const queue )
|
||||
{
|
||||
using execution_space = Kokkos::Qthreads ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space, void, void > ;
|
||||
using PoolExec = Kokkos::Impl::QthreadsExec ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
// Required: team_size <= 8
|
||||
|
||||
const int team_size = PoolExec::pool_size(2); // Threads per core
|
||||
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
|
||||
|
||||
if ( 8 < team_size ) {
|
||||
Kokkos::abort("TaskQueue<Qthreads> unsupported team size");
|
||||
}
|
||||
|
||||
#pragma omp parallel
|
||||
{
|
||||
PoolExec & self = *PoolExec::get_thread_omp();
|
||||
|
||||
Member single_exec ;
|
||||
Member team_exec( self, team_size );
|
||||
|
||||
// Team shared memory
|
||||
task_root_type * volatile * const task_shared =
|
||||
(task_root_type **) team_exec.m_team_exec->scratch_thread();
|
||||
|
||||
// Barrier across entire Qthreads thread pool to insure initialization
|
||||
#pragma omp barrier
|
||||
|
||||
// Loop until all queues are empty and no tasks in flight
|
||||
|
||||
do {
|
||||
|
||||
// Each team lead attempts to acquire either a thread team task
|
||||
// or collection of single thread tasks for the team.
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
|
||||
task_root_type * tmp =
|
||||
0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == tmp ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == tmp ; ++j ) {
|
||||
tmp = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
*task_shared = tmp ;
|
||||
|
||||
// Fence to be sure shared_task_array is stored
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
// Whole team waits for every team member to reach this statement
|
||||
team_exec.team_barrier();
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
task_root_type * const task = *task_shared ;
|
||||
|
||||
#if 0
|
||||
fprintf( stdout,
|
||||
"\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n",
|
||||
team_exec.m_group_rank,
|
||||
team_exec.m_team_rank,
|
||||
uintptr_t(task_shared),
|
||||
uintptr_t(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
if ( 0 == task ) break ; // 0 == m_ready_count
|
||||
|
||||
if ( end == task ) {
|
||||
team_exec.team_barrier();
|
||||
}
|
||||
else if ( task_root_type::TaskTeam == task->m_task_type ) {
|
||||
// Thread Team Task
|
||||
(*task->m_apply)( task, & team_exec );
|
||||
|
||||
// The m_apply function performs a barrier
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
// team member #0 completes the task, which may delete the task
|
||||
queue->complete( task );
|
||||
}
|
||||
}
|
||||
else {
|
||||
// Single Thread Task
|
||||
|
||||
if ( 0 == team_exec.team_rank() ) {
|
||||
|
||||
(*task->m_apply)( task, & single_exec );
|
||||
|
||||
queue->complete( task );
|
||||
}
|
||||
|
||||
// All team members wait for whole team to reach this statement.
|
||||
// Not necessary to complete the task.
|
||||
// Is necessary to prevent task_shared from being updated
|
||||
// before it is read by all threads.
|
||||
team_exec.team_barrier();
|
||||
}
|
||||
} while(1);
|
||||
}
|
||||
// END #pragma omp parallel
|
||||
|
||||
}
|
||||
|
||||
void TaskQueueSpecialization< Kokkos::Qthreads >::
|
||||
iff_single_thread_recursive_execute
|
||||
( TaskQueue< Kokkos::Qthreads > * const queue )
|
||||
{
|
||||
using execution_space = Kokkos::Qthreads ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space, void, void > ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
|
||||
if ( 1 == omp_get_num_threads() ) {
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member single_exec ;
|
||||
|
||||
task_root_type * task = end ;
|
||||
|
||||
do {
|
||||
|
||||
task = end ;
|
||||
|
||||
// Loop by priority and then type
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
|
||||
task = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
if ( end == task ) break ;
|
||||
|
||||
(*task->m_apply)( task, & single_exec );
|
||||
|
||||
queue->complete( task );
|
||||
|
||||
} while(1);
|
||||
}
|
||||
}
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_QTHREADS ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
|
||||
156
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
Normal file
156
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Task.hpp
Normal file
@ -0,0 +1,156 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP
|
||||
#define KOKKOS_IMPL_QTHREADS_TASK_HPP
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class TaskQueueSpecialization< Kokkos::Qthreads >
|
||||
{
|
||||
public:
|
||||
|
||||
using execution_space = Kokkos::Qthreads ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
|
||||
|
||||
// Must specify memory space
|
||||
using memory_space = Kokkos::HostSpace ;
|
||||
|
||||
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const );
|
||||
|
||||
// Must provide task queue execution function
|
||||
static void execute( queue_type * const );
|
||||
|
||||
// Must provide mechanism to set function pointer in
|
||||
// execution space from the host process.
|
||||
template< typename FunctorType >
|
||||
static
|
||||
void proc_set_apply( task_base_type::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< execution_space,
|
||||
typename FunctorType::value_type,
|
||||
FunctorType
|
||||
> ;
|
||||
*ptr = TaskType::apply ;
|
||||
}
|
||||
};
|
||||
|
||||
extern template class TaskQueue< Kokkos::Qthreads > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<>
|
||||
class TaskExec< Kokkos::Qthreads >
|
||||
{
|
||||
private:
|
||||
|
||||
TaskExec( TaskExec && ) = delete ;
|
||||
TaskExec( TaskExec const & ) = delete ;
|
||||
TaskExec & operator = ( TaskExec && ) = delete ;
|
||||
TaskExec & operator = ( TaskExec const & ) = delete ;
|
||||
|
||||
|
||||
using PoolExec = Kokkos::Impl::QthreadsExec ;
|
||||
|
||||
friend class Kokkos::Impl::TaskQueue< Kokkos::Qthreads > ;
|
||||
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Qthreads > ;
|
||||
|
||||
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
|
||||
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
|
||||
int64_t m_sync_mask ;
|
||||
int64_t mutable m_sync_value ;
|
||||
int mutable m_sync_step ;
|
||||
int m_group_rank ; ///< Which "team" subset of thread pool
|
||||
int m_team_rank ; ///< Which thread within a team
|
||||
int m_team_size ;
|
||||
|
||||
TaskExec();
|
||||
TaskExec( PoolExec & arg_exec, int arg_team_size );
|
||||
|
||||
public:
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
void * team_shared() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
|
||||
|
||||
int team_shared_size() const
|
||||
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
|
||||
|
||||
/**\brief Whole team enters this function call
|
||||
* before any teeam member returns from
|
||||
* this function call.
|
||||
*/
|
||||
void team_barrier() const ;
|
||||
#else
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
|
||||
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
|
||||
#endif
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_rank() const { return m_team_rank ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
int team_size() const { return m_team_size ; }
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_QTHREADS_TASK_HPP */
|
||||
|
||||
@ -41,11 +41,11 @@
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
// Experimental unified task-data parallel manycore LDRD
|
||||
// Experimental unified task-data parallel manycore LDRD.
|
||||
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_QTHREAD )
|
||||
#if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
|
||||
#include <stdio.h>
|
||||
|
||||
@ -56,17 +56,15 @@
|
||||
#include <string>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
|
||||
#include <Qthreads/Kokkos_Qthreads_TaskPolicy.hpp>
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
typedef TaskMember< Kokkos::Qthread , void , void > Task ;
|
||||
typedef TaskMember< Kokkos::Qthreads , void , void > Task ;
|
||||
|
||||
namespace {
|
||||
|
||||
@ -173,16 +171,16 @@ Task::TaskMember( const function_dealloc_type arg_dealloc
|
||||
|
||||
void Task::throw_error_add_dependence() const
|
||||
{
|
||||
std::cerr << "TaskMember< Qthread >::add_dependence ERROR"
|
||||
std::cerr << "TaskMember< Qthreads >::add_dependence ERROR"
|
||||
<< " state(" << m_state << ")"
|
||||
<< " dep_size(" << m_dep_size << ")"
|
||||
<< std::endl ;
|
||||
throw std::runtime_error("TaskMember< Qthread >::add_dependence ERROR");
|
||||
throw std::runtime_error("TaskMember< Qthreads >::add_dependence ERROR");
|
||||
}
|
||||
|
||||
void Task::throw_error_verify_type()
|
||||
{
|
||||
throw std::runtime_error("TaskMember< Qthread >::verify_type ERROR");
|
||||
throw std::runtime_error("TaskMember< Qthreads >::verify_type ERROR");
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
@ -190,7 +188,7 @@ void Task::throw_error_verify_type()
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
void Task::assign( Task ** const lhs , Task * rhs , const bool no_throw )
|
||||
{
|
||||
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthread>::assign ERROR" ;
|
||||
static const char msg_error_header[] = "Kokkos::Impl::TaskManager<Kokkos::Qthreads>::assign ERROR" ;
|
||||
static const char msg_error_count[] = ": negative reference count" ;
|
||||
static const char msg_error_complete[] = ": destroy task that is not complete" ;
|
||||
static const char msg_error_dependences[] = ": destroy task that has dependences" ;
|
||||
@ -294,7 +292,7 @@ fflush(stdout);
|
||||
assign( & m_dep[i] , 0 );
|
||||
}
|
||||
|
||||
// Set qthread FEB to full so that dependent tasks are allowed to execute.
|
||||
// Set Qthreads FEB to full so that dependent tasks are allowed to execute.
|
||||
// This 'task' may be deleted immediately following this function call.
|
||||
qthread_fill( & m_qfeb );
|
||||
|
||||
@ -319,10 +317,10 @@ aligned_t Task::qthread_func( void * arg )
|
||||
);
|
||||
|
||||
if ( task->m_apply_team && ! task->m_apply_single ) {
|
||||
Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
|
||||
Kokkos::Impl::QthreadsTeamPolicyMember::TaskTeam task_team_tag ;
|
||||
|
||||
// Initialize team size and rank with shephered info
|
||||
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
|
||||
Kokkos::Impl::QthreadsTeamPolicyMember member( task_team_tag );
|
||||
|
||||
(*task->m_apply_team)( task , member );
|
||||
|
||||
@ -344,7 +342,7 @@ fflush(stdout);
|
||||
}
|
||||
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
|
||||
// Team hard-wired to one, no cloning
|
||||
Kokkos::Impl::QthreadTeamPolicyMember member ;
|
||||
Kokkos::Impl::QthreadsTeamPolicyMember member ;
|
||||
(*task->m_apply_team)( task , member );
|
||||
task->closeout();
|
||||
}
|
||||
@ -384,8 +382,8 @@ void Task::schedule()
|
||||
// Increment active task count before spawning.
|
||||
Kokkos::atomic_increment( m_active_count );
|
||||
|
||||
// spawn in qthread. must malloc the precondition array and give to qthread.
|
||||
// qthread will eventually free this allocation so memory will not be leaked.
|
||||
// spawn in Qthreads. must malloc the precondition array and give to Qthreads.
|
||||
// Qthreads will eventually free this allocation so memory will not be leaked.
|
||||
|
||||
// concern with thread safety of malloc, does this need to be guarded?
|
||||
aligned_t ** qprecon = (aligned_t **) malloc( ( m_dep_size + 1 ) * sizeof(aligned_t *) );
|
||||
@ -393,7 +391,7 @@ void Task::schedule()
|
||||
qprecon[0] = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
|
||||
|
||||
for ( int i = 0 ; i < m_dep_size ; ++i ) {
|
||||
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthread precondition flag
|
||||
qprecon[i+1] = & m_dep[i]->m_qfeb ; // Qthreads precondition flag
|
||||
}
|
||||
|
||||
if ( m_apply_team && ! m_apply_single ) {
|
||||
@ -446,7 +444,7 @@ fflush(stdout);
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
TaskPolicy< Kokkos::Qthread >::
|
||||
TaskPolicy< Kokkos::Qthreads >::
|
||||
TaskPolicy
|
||||
( const unsigned /* arg_task_max_count */
|
||||
, const unsigned /* arg_task_max_size */
|
||||
@ -462,7 +460,7 @@ TaskPolicy
|
||||
|
||||
if ( m_team_size != 1 && m_team_size != num_worker_per_shepherd ) {
|
||||
std::ostringstream msg ;
|
||||
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthread >( "
|
||||
msg << "Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads >( "
|
||||
<< "default_depedence = " << arg_task_default_dependence_capacity
|
||||
<< " , team_size = " << arg_task_team_size
|
||||
<< " ) ERROR, valid team_size arguments are { (omitted) , 1 , " << num_worker_per_shepherd << " }" ;
|
||||
@ -470,14 +468,14 @@ TaskPolicy
|
||||
}
|
||||
}
|
||||
|
||||
TaskPolicy< Kokkos::Qthread >::member_type &
|
||||
TaskPolicy< Kokkos::Qthread >::member_single()
|
||||
TaskPolicy< Kokkos::Qthreads >::member_type &
|
||||
TaskPolicy< Kokkos::Qthreads >::member_single()
|
||||
{
|
||||
static member_type s ;
|
||||
return s ;
|
||||
}
|
||||
|
||||
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
|
||||
void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthreads > & policy )
|
||||
{
|
||||
volatile int * const active_task_count = & policy.m_active_count ;
|
||||
while ( *active_task_count ) qthread_yield();
|
||||
@ -486,6 +484,5 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
|
||||
} // namespace Experimental
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #if defined( KOKKOS_ENABLE_QTHREAD ) */
|
||||
|
||||
#endif // #if defined( KOKKOS_ENABLE_TASKDAG )
|
||||
#endif // #if defined( KOKKOS_ENABLE_QTHREADS )
|
||||
@ -43,15 +43,15 @@
|
||||
|
||||
// Experimental unified task-data parallel manycore LDRD
|
||||
|
||||
#ifndef KOKKOS_QTHREAD_TASKSCHEDULER_HPP
|
||||
#define KOKKOS_QTHREAD_TASKSCHEDULER_HPP
|
||||
#ifndef KOKKOS_QTHREADS_TASKSCHEDULER_HPP
|
||||
#define KOKKOS_QTHREADS_TASKSCHEDULER_HPP
|
||||
|
||||
#include <string>
|
||||
#include <typeinfo>
|
||||
#include <stdexcept>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
// Defines to enable experimental Qthread functionality
|
||||
// Defines to enable experimental Qthreads functionality
|
||||
|
||||
#define QTHREAD_LOCAL_PRIORITY
|
||||
#define CLONED_TASKS
|
||||
@ -63,7 +63,7 @@
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#include <Kokkos_Qthread.hpp>
|
||||
#include <Kokkos_Qthreads.hpp>
|
||||
#include <Kokkos_TaskScheduler.hpp>
|
||||
#include <Kokkos_View.hpp>
|
||||
|
||||
@ -78,13 +78,13 @@ namespace Experimental {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class TaskMember< Kokkos::Qthread , void , void >
|
||||
class TaskMember< Kokkos::Qthreads , void , void >
|
||||
{
|
||||
public:
|
||||
|
||||
typedef TaskMember * (* function_verify_type) ( TaskMember * );
|
||||
typedef void (* function_single_type) ( TaskMember * );
|
||||
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
|
||||
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadsTeamPolicyMember & );
|
||||
typedef void (* function_dealloc_type)( TaskMember * );
|
||||
|
||||
private:
|
||||
@ -94,7 +94,7 @@ private:
|
||||
const function_single_type m_apply_single ; ///< Apply function
|
||||
const function_team_type m_apply_team ; ///< Apply function
|
||||
int volatile * const m_active_count ; ///< Count of active tasks on this policy
|
||||
aligned_t m_qfeb ; ///< Qthread full/empty bit
|
||||
aligned_t m_qfeb ; ///< Qthreads full/empty bit
|
||||
TaskMember ** const m_dep ; ///< Dependences
|
||||
const int m_dep_capacity ; ///< Capacity of dependences
|
||||
int m_dep_size ; ///< Actual count of dependences
|
||||
@ -129,7 +129,7 @@ protected :
|
||||
|
||||
~TaskMember();
|
||||
|
||||
// Used by TaskMember< Qthread , ResultType , void >
|
||||
// Used by TaskMember< Qthreads , ResultType , void >
|
||||
TaskMember( const function_verify_type arg_verify
|
||||
, const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
@ -139,7 +139,7 @@ protected :
|
||||
, const unsigned arg_dependence_capacity
|
||||
);
|
||||
|
||||
// Used for TaskMember< Qthread , void , void >
|
||||
// Used for TaskMember< Qthreads , void , void >
|
||||
TaskMember( const function_dealloc_type arg_dealloc
|
||||
, const function_single_type arg_apply_single
|
||||
, const function_team_type arg_apply_team
|
||||
@ -175,15 +175,15 @@ public:
|
||||
/* Inheritence Requirements on task types:
|
||||
* typedef FunctorType::value_type value_type ;
|
||||
* class DerivedTaskType
|
||||
* : public TaskMember< Qthread , value_type , FunctorType >
|
||||
* : public TaskMember< Qthreads , value_type , FunctorType >
|
||||
* { ... };
|
||||
* class TaskMember< Qthread , value_type , FunctorType >
|
||||
* : public TaskMember< Qthread , value_type , void >
|
||||
* class TaskMember< Qthreads , value_type , FunctorType >
|
||||
* : public TaskMember< Qthreads , value_type , void >
|
||||
* , public Functor
|
||||
* { ... };
|
||||
* If value_type != void
|
||||
* class TaskMember< Qthread , value_type , void >
|
||||
* : public TaskMember< Qthread , void , void >
|
||||
* class TaskMember< Qthreads , value_type , void >
|
||||
* : public TaskMember< Qthreads , void , void >
|
||||
*
|
||||
* Allocate space for DerivedTaskType followed by TaskMember*[ dependence_capacity ]
|
||||
*
|
||||
@ -300,10 +300,10 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void apply_single( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t )
|
||||
{
|
||||
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
|
||||
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
|
||||
|
||||
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
|
||||
// : public TaskMember< Kokkos::Qthread , ResultType , void >
|
||||
// TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
|
||||
// : public TaskMember< Kokkos::Qthreads , ResultType , void >
|
||||
// , public FunctorType
|
||||
// { ... };
|
||||
|
||||
@ -316,10 +316,10 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void apply_single( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t )
|
||||
{
|
||||
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
|
||||
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
|
||||
|
||||
// TaskMember< Kokkos::Qthread , ResultType , FunctorType >
|
||||
// : public TaskMember< Kokkos::Qthread , ResultType , void >
|
||||
// TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
|
||||
// : public TaskMember< Kokkos::Qthreads , ResultType , void >
|
||||
// , public FunctorType
|
||||
// { ... };
|
||||
|
||||
@ -333,9 +333,9 @@ public:
|
||||
template< class FunctorType , class ResultType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void apply_team( typename std::enable_if< ! std::is_same< ResultType , void >::value , TaskMember * >::type t
|
||||
, Kokkos::Impl::QthreadTeamPolicyMember & member )
|
||||
, Kokkos::Impl::QthreadsTeamPolicyMember & member )
|
||||
{
|
||||
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
|
||||
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
|
||||
|
||||
derived_type & m = * static_cast< derived_type * >( t );
|
||||
|
||||
@ -345,9 +345,9 @@ public:
|
||||
template< class FunctorType , class ResultType >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void apply_team( typename std::enable_if< std::is_same< ResultType , void >::value , TaskMember * >::type t
|
||||
, Kokkos::Impl::QthreadTeamPolicyMember & member )
|
||||
, Kokkos::Impl::QthreadsTeamPolicyMember & member )
|
||||
{
|
||||
typedef TaskMember< Kokkos::Qthread , ResultType , FunctorType > derived_type ;
|
||||
typedef TaskMember< Kokkos::Qthreads , ResultType , FunctorType > derived_type ;
|
||||
|
||||
derived_type & m = * static_cast< derived_type * >( t );
|
||||
|
||||
@ -356,7 +356,7 @@ public:
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/** \brief Base class for tasks with a result value in the Qthread execution space.
|
||||
/** \brief Base class for tasks with a result value in the Qthreads execution space.
|
||||
*
|
||||
* The FunctorType must be void because this class is accessed by the
|
||||
* Future class for the task and result value.
|
||||
@ -365,8 +365,8 @@ public:
|
||||
* can correctly static_cast from the 'root class' to this class.
|
||||
*/
|
||||
template < class ResultType >
|
||||
class TaskMember< Kokkos::Qthread , ResultType , void >
|
||||
: public TaskMember< Kokkos::Qthread , void , void >
|
||||
class TaskMember< Kokkos::Qthreads , ResultType , void >
|
||||
: public TaskMember< Kokkos::Qthreads , void , void >
|
||||
{
|
||||
public:
|
||||
|
||||
@ -379,7 +379,7 @@ public:
|
||||
|
||||
protected:
|
||||
|
||||
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
|
||||
typedef TaskMember< Kokkos::Qthreads , void , void > task_root_type ;
|
||||
typedef task_root_type::function_dealloc_type function_dealloc_type ;
|
||||
typedef task_root_type::function_single_type function_single_type ;
|
||||
typedef task_root_type::function_team_type function_team_type ;
|
||||
@ -404,16 +404,16 @@ protected:
|
||||
};
|
||||
|
||||
template< class ResultType , class FunctorType >
|
||||
class TaskMember< Kokkos::Qthread , ResultType , FunctorType >
|
||||
: public TaskMember< Kokkos::Qthread , ResultType , void >
|
||||
class TaskMember< Kokkos::Qthreads , ResultType , FunctorType >
|
||||
: public TaskMember< Kokkos::Qthreads , ResultType , void >
|
||||
, public FunctorType
|
||||
{
|
||||
public:
|
||||
|
||||
typedef FunctorType functor_type ;
|
||||
|
||||
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
|
||||
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
|
||||
typedef TaskMember< Kokkos::Qthreads , void , void > task_root_type ;
|
||||
typedef TaskMember< Kokkos::Qthreads , ResultType , void > task_base_type ;
|
||||
typedef task_root_type::function_dealloc_type function_dealloc_type ;
|
||||
typedef task_root_type::function_single_type function_single_type ;
|
||||
typedef task_root_type::function_team_type function_team_type ;
|
||||
@ -447,16 +447,16 @@ public:
|
||||
namespace Kokkos {
|
||||
namespace Experimental {
|
||||
|
||||
void wait( TaskPolicy< Kokkos::Qthread > & );
|
||||
void wait( TaskPolicy< Kokkos::Qthreads > & );
|
||||
|
||||
template<>
|
||||
class TaskPolicy< Kokkos::Qthread >
|
||||
class TaskPolicy< Kokkos::Qthreads >
|
||||
{
|
||||
public:
|
||||
|
||||
typedef Kokkos::Qthread execution_space ;
|
||||
typedef Kokkos::Qthreads execution_space ;
|
||||
typedef TaskPolicy execution_policy ;
|
||||
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
|
||||
typedef Kokkos::Impl::QthreadsTeamPolicyMember member_type ;
|
||||
|
||||
private:
|
||||
|
||||
@ -650,7 +650,7 @@ public:
|
||||
|
||||
static member_type & member_single();
|
||||
|
||||
friend void wait( TaskPolicy< Kokkos::Qthread > & );
|
||||
friend void wait( TaskPolicy< Kokkos::Qthreads > & );
|
||||
};
|
||||
|
||||
} /* namespace Experimental */
|
||||
@ -660,5 +660,5 @@ public:
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */
|
||||
#endif /* #define KOKKOS_QTHREADS_TASK_HPP */
|
||||
|
||||
319
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
Normal file
319
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue.hpp
Normal file
@ -0,0 +1,319 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
/** \brief Manage task allocation, deallocation, and scheduling.
|
||||
*
|
||||
* Task execution is handled here directly for the Qthread implementation.
|
||||
*/
|
||||
template<>
|
||||
class TaskQueue< Kokkos::Qthread > {
|
||||
private:
|
||||
|
||||
using execution_space = Kokkos::Qthread ;
|
||||
using memory_space = Kokkos::HostSpace
|
||||
using device_type = Kokkos::Device< execution_space, memory_space > ;
|
||||
using memory_pool = Kokkos::Experimental::MemoryPool< device_type > ;
|
||||
using task_root_type = Kokkos::Impl::TaskBase< execution_space, void, void > ;
|
||||
|
||||
friend class Kokkos::TaskScheduler< execution_space > ;
|
||||
|
||||
struct Destroy {
|
||||
TaskQueue * m_queue ;
|
||||
void destroy_shared_allocation();
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
enum : int { TASK_STATE_NULL = 0, ///< Does not exist
|
||||
TASK_STATE_CONSTRUCTING = 1, ///< Is under construction
|
||||
TASK_STATE_WAITING = 2, ///< Is waiting for execution
|
||||
TASK_STATE_EXECUTING = 4, ///< Is executing
|
||||
TASK_STATE_RESPAWN = 8, ///< Requested respawn
|
||||
TASK_STATE_COMPLETE = 16 ///< Execution is complete
|
||||
};
|
||||
|
||||
// Queue is organized as [ priority ][ type ]
|
||||
|
||||
memory_pool m_memory ;
|
||||
unsigned m_team_size ; // Number of threads in a team
|
||||
long m_accum_alloc ; // Accumulated number of allocations
|
||||
int m_count_alloc ; // Current number of allocations
|
||||
int m_max_alloc ; // Maximum number of allocations
|
||||
int m_ready_count ; // Number of ready or executing
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
~TaskQueue();
|
||||
TaskQueue() = delete ;
|
||||
TaskQueue( TaskQueue && ) = delete ;
|
||||
TaskQueue( TaskQueue const & ) = delete ;
|
||||
TaskQueue & operator = ( TaskQueue && ) = delete ;
|
||||
TaskQueue & operator = ( TaskQueue const & ) = delete ;
|
||||
|
||||
TaskQueue
|
||||
( const memory_space & arg_space,
|
||||
unsigned const arg_memory_pool_capacity,
|
||||
unsigned const arg_memory_pool_superblock_capacity_log2
|
||||
);
|
||||
|
||||
// Schedule a task
|
||||
// Precondition:
|
||||
// task is not executing
|
||||
// task->m_next is the dependence or zero
|
||||
// Postcondition:
|
||||
// task->m_next is linked list membership
|
||||
KOKKOS_FUNCTION
|
||||
void schedule( task_root_type * const );
|
||||
|
||||
// Reschedule a task
|
||||
// Precondition:
|
||||
// task is in Executing state
|
||||
// task->m_next == LockTag
|
||||
// Postcondition:
|
||||
// task is in Executing-Respawn state
|
||||
// task->m_next == 0 (no dependence)
|
||||
KOKKOS_FUNCTION
|
||||
void reschedule( task_root_type * );
|
||||
|
||||
// Complete a task
|
||||
// Precondition:
|
||||
// task is not executing
|
||||
// task->m_next == LockTag => task is complete
|
||||
// task->m_next != LockTag => task is respawn
|
||||
// Postcondition:
|
||||
// task->m_wait == LockTag => task is complete
|
||||
// task->m_wait != LockTag => task is waiting
|
||||
KOKKOS_FUNCTION
|
||||
void complete( task_root_type * );
|
||||
|
||||
public:
|
||||
|
||||
// If and only if the execution space is a single thread
|
||||
// then execute ready tasks.
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void iff_single_thread_recursive_execute()
|
||||
{
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
specialization::iff_single_thread_recursive_execute( this );
|
||||
#endif
|
||||
}
|
||||
|
||||
void execute() { specialization::execute( this ); }
|
||||
|
||||
template< typename FunctorType >
|
||||
void proc_set_apply( typename task_root_type::function_type * ptr )
|
||||
{
|
||||
specialization::template proc_set_apply< FunctorType >( ptr );
|
||||
}
|
||||
|
||||
// Assign task pointer with reference counting of assigned tasks
|
||||
template< typename LV, typename RV >
|
||||
KOKKOS_FUNCTION static
|
||||
void assign( TaskBase< execution_space, LV, void > ** const lhs,
|
||||
TaskBase< execution_space, RV, void > * const rhs )
|
||||
{
|
||||
using task_lhs = TaskBase< execution_space, LV, void > ;
|
||||
#if 0
|
||||
{
|
||||
printf( "assign( 0x%lx { 0x%lx %d %d }, 0x%lx { 0x%lx %d %d } )\n",
|
||||
uintptr_t( lhs ? *lhs : 0 ),
|
||||
uintptr_t( lhs && *lhs ? (*lhs)->m_next : 0 ),
|
||||
int( lhs && *lhs ? (*lhs)->m_task_type : 0 ),
|
||||
int( lhs && *lhs ? (*lhs)->m_ref_count : 0 ),
|
||||
uintptr_t(rhs),
|
||||
uintptr_t( rhs ? rhs->m_next : 0 ),
|
||||
int( rhs ? rhs->m_task_type : 0 ),
|
||||
int( rhs ? rhs->m_ref_count : 0 )
|
||||
);
|
||||
fflush( stdout );
|
||||
}
|
||||
#endif
|
||||
|
||||
if ( *lhs )
|
||||
{
|
||||
const int count = Kokkos::atomic_fetch_add( &((*lhs)->m_ref_count), -1 );
|
||||
|
||||
if ( ( 1 == count ) && ( (*lhs)->m_state == TASK_STATE_COMPLETE ) ) {
|
||||
// Reference count is zero and task is complete, deallocate.
|
||||
(*lhs)->m_queue->deallocate( *lhs, (*lhs)->m_alloc_size );
|
||||
}
|
||||
else if ( count <= 1 ) {
|
||||
Kokkos::abort("TaskScheduler task has negative reference count or is incomplete" );
|
||||
}
|
||||
|
||||
// GEM: Should I check that there are no dependences here? Can the state
|
||||
// be set to complete while there are still dependences?
|
||||
}
|
||||
|
||||
if ( rhs ) { Kokkos::atomic_fetch_add( &(rhs->m_ref_count), 1 ); }
|
||||
|
||||
// Force write of *lhs
|
||||
|
||||
*static_cast< task_lhs * volatile * >(lhs) = rhs ;
|
||||
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
size_t allocate_block_size( size_t n ); ///< Actual block size allocated
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
void * allocate( size_t n ); ///< Allocate from the memory pool
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
void deallocate( void * p, size_t n ); ///< Deallocate to the memory pool
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template<>
|
||||
class TaskBase< Kokkos::Qthread, void, void >
|
||||
{
|
||||
public:
|
||||
|
||||
enum : int16_t { TaskTeam = TaskBase< void, void, void >::TaskTeam,
|
||||
TaskSingle = TaskBase< void, void, void >::TaskSingle,
|
||||
Aggregate = TaskBase< void, void, void >::Aggregate };
|
||||
|
||||
enum : uintptr_t { LockTag = TaskBase< void, void, void >::LockTag,
|
||||
EndTag = TaskBase< void, void, void >::EndTag };
|
||||
|
||||
using execution_space = Kokkos::Qthread ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
|
||||
template< typename > friend class Kokkos::TaskScheduler ;
|
||||
|
||||
typedef void (* function_type) ( TaskBase *, void * );
|
||||
|
||||
// sizeof(TaskBase) == 48
|
||||
|
||||
function_type m_apply ; ///< Apply function pointer
|
||||
queue_type * m_queue ; ///< Queue in which this task resides
|
||||
TaskBase * m_dep ; ///< Dependence
|
||||
int32_t m_ref_count ; ///< Reference count
|
||||
int32_t m_alloc_size ; ///< Allocation size
|
||||
int32_t m_dep_count ; ///< Aggregate's number of dependences
|
||||
int16_t m_task_type ; ///< Type of task
|
||||
int16_t m_priority ; ///< Priority of runnable task
|
||||
aligned_t m_qfeb ; ///< Qthread full/empty bit
|
||||
int m_state ; ///< State of the task
|
||||
|
||||
TaskBase( TaskBase && ) = delete ;
|
||||
TaskBase( const TaskBase & ) = delete ;
|
||||
TaskBase & operator = ( TaskBase && ) = delete ;
|
||||
TaskBase & operator = ( const TaskBase & ) = delete ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr TaskBase() noexcept
|
||||
: m_apply(0),
|
||||
m_queue(0),
|
||||
m_dep(0),
|
||||
m_ref_count(0),
|
||||
m_alloc_size(0),
|
||||
m_dep_count(0),
|
||||
m_task_type( TaskSingle ),
|
||||
m_priority( 1 /* TaskRegularPriority */ ),
|
||||
m_qfeb(0),
|
||||
m_state( queue_type::TASK_STATE_CONSTRUCTING )
|
||||
{
|
||||
qthread_empty( & m_qfeb ); // Set to full when complete
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
static aligned_t qthread_func( void * arg );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskBase ** aggregate_dependences()
|
||||
{ return reinterpret_cast<TaskBase**>( this + 1 ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void requested_respawn()
|
||||
{ return m_state == queue_type::TASK_STATE_RESPAWN; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void add_dependence( TaskBase* dep )
|
||||
{
|
||||
// Assign dependence to m_dep. It will be processed in the subsequent
|
||||
// call to schedule. Error if the dependence is reset.
|
||||
if ( 0 != Kokkos::atomic_exchange( & m_dep, dep ) ) {
|
||||
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
|
||||
}
|
||||
|
||||
if ( 0 != dep ) {
|
||||
// The future may be destroyed upon returning from this call
|
||||
// so increment reference count to track this assignment.
|
||||
Kokkos::atomic_fetch_add( &(dep->m_ref_count), 1 );
|
||||
}
|
||||
}
|
||||
|
||||
using get_return_type = void ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
get_return_type get() const {}
|
||||
};
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
436
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
Normal file
436
lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_TaskQueue_impl.hpp
Normal file
@ -0,0 +1,436 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#if defined( KOKKOS_ENABLE_TASKPOLICY )
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
void TaskQueue< ExecSpace >::Destroy::destroy_shared_allocation()
|
||||
{
|
||||
m_queue->~TaskQueue();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
TaskQueue< ExecSpace >::TaskQueue
|
||||
( const TaskQueue< ExecSpace >::memory_space & arg_space,
|
||||
unsigned const arg_memory_pool_capacity,
|
||||
unsigned const arg_memory_pool_superblock_capacity_log2 )
|
||||
: m_memory( arg_space,
|
||||
arg_memory_pool_capacity,
|
||||
arg_memory_pool_superblock_capacity_log2 )
|
||||
m_team_size( unsigned( qthread_num_workers_local(NO_SHEPHERD) ) ),
|
||||
m_accum_alloc(0),
|
||||
m_count_alloc(0),
|
||||
m_max_alloc(0),
|
||||
m_ready_count(0)
|
||||
{}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
TaskQueue< ExecSpace >::~TaskQueue()
|
||||
{
|
||||
// Verify that ready count is zero.
|
||||
if ( 0 != m_ready_count ) {
|
||||
Kokkos::abort("TaskQueue::~TaskQueue ERROR: has ready or executing tasks");
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
size_t TaskQueue< ExecSpace >::allocate_block_size( size_t n )
|
||||
{
|
||||
return m_memory.allocate_block_size( n );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
void * TaskQueue< ExecSpace >::allocate( size_t n )
|
||||
{
|
||||
void * const p = m_memory.allocate(n);
|
||||
|
||||
if ( p ) {
|
||||
Kokkos::atomic_increment( & m_accum_alloc );
|
||||
Kokkos::atomic_increment( & m_count_alloc );
|
||||
|
||||
if ( m_max_alloc < m_count_alloc ) m_max_alloc = m_count_alloc ;
|
||||
}
|
||||
|
||||
return p ;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
void TaskQueue< ExecSpace >::deallocate( void * p, size_t n )
|
||||
{
|
||||
m_memory.deallocate( p, n );
|
||||
Kokkos::atomic_decrement( & m_count_alloc );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
void TaskQueue< ExecSpace >::schedule
|
||||
( TaskQueue< ExecSpace >::task_root_type * const task )
|
||||
{
|
||||
#if 0
|
||||
printf( "schedule( 0x%lx { %d %d %d }\n",
|
||||
uintptr_t(task),
|
||||
task->m_task_type,
|
||||
task->m_priority,
|
||||
task->m_ref_count );
|
||||
#endif
|
||||
|
||||
// The task has been constructed and is waiting to be executed.
|
||||
task->m_state = TASK_STATE_WAITING ;
|
||||
|
||||
if ( task->m_task_type != task_root_type::Aggregate ) {
|
||||
// Scheduling a single or team task.
|
||||
|
||||
// Increment active task count before spawning.
|
||||
Kokkos::atomic_increment( m_ready_count );
|
||||
|
||||
if ( task->m_dep == 0 ) {
|
||||
// Schedule a task with no dependences.
|
||||
|
||||
if ( task_root_type::TaskTeam == task->m_task_type && m_team_size > 1 ) {
|
||||
// If more than one shepherd spawn on a shepherd other than this shepherd
|
||||
const int num_shepherd = qthread_num_shepherds();
|
||||
const int this_shepherd = qthread_shep();
|
||||
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
|
||||
|
||||
#if 0
|
||||
fprintf( stdout,
|
||||
"worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
|
||||
qthread_shep(),
|
||||
qthread_worker_local(NULL),
|
||||
reinterpret_cast<unsigned long>(this),
|
||||
spawn_shepherd,
|
||||
m_team_size - 1
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
qthread_spawn_cloneable(
|
||||
& task_root_type::qthread_func,
|
||||
task,
|
||||
0,
|
||||
NULL,
|
||||
0, // no depenedences
|
||||
0, // dependences array
|
||||
spawn_shepherd,
|
||||
unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
|
||||
m_team_size - 1
|
||||
);
|
||||
}
|
||||
else {
|
||||
qthread_spawn(
|
||||
& task_root_type::qthread_func,
|
||||
task,
|
||||
0,
|
||||
NULL,
|
||||
0, // no depenedences
|
||||
0, // dependences array
|
||||
NO_SHEPHERD,
|
||||
QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
|
||||
);
|
||||
}
|
||||
}
|
||||
else if ( task->m_dep->m_task_type != task_root_type::Aggregate )
|
||||
// Malloc the precondition array to pass to qthread_spawn(). For
|
||||
// non-aggregate tasks, it is a single pointer since there are no
|
||||
// dependences. Qthreads will eventually free this allocation so memory will
|
||||
// not be leaked. Is malloc thread-safe? Should this call be guarded? The
|
||||
// memory can't be allocated from the pool allocator because Qthreads frees
|
||||
// it using free().
|
||||
aligned_t ** qprecon = (aligned_t **) malloc( sizeof(aligned_t *) );
|
||||
|
||||
*qprecon = reinterpret_cast<aligned_t *>( uintptr_t(m_dep_size) );
|
||||
|
||||
if ( task->m_task_type == task_root_type::TaskTeam && m_team_size > 1) {
|
||||
// If more than one shepherd spawn on a shepherd other than this shepherd
|
||||
const int num_shepherd = qthread_num_shepherds();
|
||||
const int this_shepherd = qthread_shep();
|
||||
int spawn_shepherd = ( this_shepherd + 1 ) % num_shepherd ;
|
||||
|
||||
#if 0
|
||||
fprintf( stdout,
|
||||
"worker(%d.%d) task 0x%.12lx spawning on shepherd(%d) clone(%d)\n",
|
||||
qthread_shep(),
|
||||
qthread_worker_local(NULL),
|
||||
reinterpret_cast<unsigned long>(this),
|
||||
spawn_shepherd,
|
||||
m_team_size - 1
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
qthread_spawn_cloneable(
|
||||
& Task::qthread_func,
|
||||
this,
|
||||
0,
|
||||
NULL,
|
||||
m_dep_size,
|
||||
qprecon, /* dependences */
|
||||
spawn_shepherd,
|
||||
unsigned( QTHREAD_SPAWN_SIMPLE | QTHREAD_SPAWN_LOCAL_PRIORITY ),
|
||||
m_team_size - 1
|
||||
);
|
||||
}
|
||||
else {
|
||||
qthread_spawn(
|
||||
& Task::qthread_func, /* function */
|
||||
this, /* function argument */
|
||||
0,
|
||||
NULL,
|
||||
m_dep_size,
|
||||
qprecon, /* dependences */
|
||||
NO_SHEPHERD,
|
||||
QTHREAD_SPAWN_SIMPLE /* allows optimization for non-blocking task */
|
||||
);
|
||||
}
|
||||
}
|
||||
else {
|
||||
// GEM: How do I handle an aggregate (when_all) task?
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
void TaskQueue< ExecSpace >::reschedule( task_root_type * task )
|
||||
{
|
||||
// Precondition:
|
||||
// task is in Executing state
|
||||
// task->m_next == LockTag
|
||||
//
|
||||
// Postcondition:
|
||||
// task is in Executing-Respawn state
|
||||
// task->m_next == 0 (no dependence)
|
||||
|
||||
task_root_type * const zero = (task_root_type *) 0 ;
|
||||
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
|
||||
|
||||
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
|
||||
Kokkos::abort("TaskScheduler::respawn ERROR: already respawned");
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
void TaskQueue< ExecSpace >::complete
|
||||
( TaskQueue< ExecSpace >::task_root_type * task )
|
||||
{
|
||||
// Complete a runnable task that has finished executing
|
||||
// or a when_all task when all of its dependeneces are complete.
|
||||
|
||||
task_root_type * const zero = (task_root_type *) 0 ;
|
||||
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
#if 0
|
||||
printf( "complete( 0x%lx { 0x%lx 0x%lx %d %d %d }\n",
|
||||
uintptr_t(task),
|
||||
uintptr_t(task->m_wait),
|
||||
uintptr_t(task->m_next),
|
||||
task->m_task_type,
|
||||
task->m_priority,
|
||||
task->m_ref_count
|
||||
);
|
||||
fflush( stdout );
|
||||
#endif
|
||||
|
||||
const bool runnable = task_root_type::Aggregate != task->m_task_type ;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
if ( runnable && lock != task->m_next ) {
|
||||
// Is a runnable task has finished executing and requested respawn.
|
||||
// Schedule the task for subsequent execution.
|
||||
|
||||
schedule( task );
|
||||
}
|
||||
//----------------------------------------
|
||||
else {
|
||||
// Is either an aggregate or a runnable task that executed
|
||||
// and did not respawn. Transition this task to complete.
|
||||
|
||||
// If 'task' is an aggregate then any of the runnable tasks that
|
||||
// it depends upon may be attempting to complete this 'task'.
|
||||
// Must only transition a task once to complete status.
|
||||
// This is controled by atomically locking the wait queue.
|
||||
|
||||
// Stop other tasks from adding themselves to this task's wait queue
|
||||
// by locking the head of this task's wait queue.
|
||||
|
||||
task_root_type * x = Kokkos::atomic_exchange( & task->m_wait, lock );
|
||||
|
||||
if ( x != (task_root_type *) lock ) {
|
||||
|
||||
// This thread has transitioned this 'task' to complete.
|
||||
// 'task' is no longer in a queue and is not executing
|
||||
// so decrement the reference count from 'task's creation.
|
||||
// If no other references to this 'task' then it will be deleted.
|
||||
|
||||
TaskQueue::assign( & task, zero );
|
||||
|
||||
// This thread has exclusive access to the wait list so
|
||||
// the concurrency-safe pop_task function is not needed.
|
||||
// Schedule the tasks that have been waiting on the input 'task',
|
||||
// which may have been deleted.
|
||||
|
||||
while ( x != end ) {
|
||||
|
||||
// Set x->m_next = zero <= no dependence
|
||||
|
||||
task_root_type * const next =
|
||||
(task_root_type *) Kokkos::atomic_exchange( & x->m_next, zero );
|
||||
|
||||
schedule( x );
|
||||
|
||||
x = next ;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if ( runnable ) {
|
||||
// A runnable task was popped from a ready queue and executed.
|
||||
// If respawned into a ready queue then the ready count was incremented
|
||||
// so decrement whether respawned or not.
|
||||
Kokkos::atomic_decrement( & m_ready_count );
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<>
|
||||
aligned_t
|
||||
TaskBase< Kokkos::Qthreads, void, void >::qthread_func( void * arg )
|
||||
{
|
||||
using execution_space = Kokkos::Qthreads ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using Member = Kokkos::Impl::QthreadsTeamPolicyMember;
|
||||
|
||||
task_root_type * const task = reinterpret_cast< task_root_type * >( arg );
|
||||
|
||||
// First member of the team change state to executing.
|
||||
// Use compare-exchange to avoid race condition with a respawn.
|
||||
Kokkos::atomic_compare_exchange_strong( & task->m_state,
|
||||
queue_type::TASK_STATE_WAITING,
|
||||
queue_type::TASK_STATE_EXECUTING
|
||||
);
|
||||
|
||||
if ( task_root_type::TaskTeam == task->m_task_type )
|
||||
{
|
||||
if ( 1 < task->m_queue->m_team_size ) {
|
||||
// Team task with team size of more than 1.
|
||||
Member::TaskTeam task_team_tag ;
|
||||
|
||||
// Initialize team size and rank with shephered info
|
||||
Member member( task_team_tag );
|
||||
|
||||
(*task->m_apply)( task , & member );
|
||||
|
||||
#if 0
|
||||
fprintf( stdout,
|
||||
"worker(%d.%d) task 0x%.12lx executed by member(%d:%d)\n",
|
||||
qthread_shep(),
|
||||
qthread_worker_local(NULL),
|
||||
reinterpret_cast<unsigned long>(task),
|
||||
member.team_rank(),
|
||||
member.team_size()
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
member.team_barrier();
|
||||
if ( member.team_rank() == 0 ) task->closeout();
|
||||
member.team_barrier();
|
||||
}
|
||||
else {
|
||||
// Team task with team size of 1.
|
||||
Member member ;
|
||||
(*task->m_apply)( task , & member );
|
||||
task->closeout();
|
||||
}
|
||||
}
|
||||
else {
|
||||
(*task->m_apply)( task );
|
||||
task->closeout();
|
||||
}
|
||||
|
||||
#if 0
|
||||
fprintf( stdout
|
||||
, "worker(%d.%d) task 0x%.12lx return\n"
|
||||
, qthread_shep()
|
||||
, qthread_worker_local(NULL)
|
||||
, reinterpret_cast<unsigned long>(task)
|
||||
);
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
return 0 ;
|
||||
}
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
|
||||
|
||||
@ -22,4 +22,3 @@ sh autogen.sh
|
||||
# install
|
||||
|
||||
make install
|
||||
|
||||
@ -264,7 +264,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * )
|
||||
const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 );
|
||||
|
||||
for ( int i = 0 ; i < n ; ++i ) {
|
||||
Impl::spinwait( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
}
|
||||
|
||||
exec.m_pool_state = ThreadsExec::Inactive ;
|
||||
@ -308,7 +308,7 @@ void ThreadsExec::fence()
|
||||
{
|
||||
if ( s_thread_pool_size[0] ) {
|
||||
// Wait for the root thread to complete:
|
||||
Impl::spinwait( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active );
|
||||
}
|
||||
|
||||
s_current_function = 0 ;
|
||||
@ -724,7 +724,7 @@ void ThreadsExec::initialize( unsigned thread_count ,
|
||||
// Init the array for used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
@ -777,7 +777,7 @@ void ThreadsExec::finalize()
|
||||
s_threads_process.m_pool_fan_size = 0 ;
|
||||
s_threads_process.m_pool_state = ThreadsExec::Inactive ;
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
@ -187,13 +187,13 @@ public:
|
||||
// Fan-in reduction with highest ranking thread as the root
|
||||
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
|
||||
// Wait: Active -> Rendezvous
|
||||
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_pool_state = ThreadsExec::Rendezvous ;
|
||||
// Wait: Rendezvous -> Active
|
||||
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
|
||||
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
|
||||
}
|
||||
else {
|
||||
// Root thread does the reduction and broadcast
|
||||
@ -229,13 +229,13 @@ public:
|
||||
// Fan-in reduction with highest ranking thread as the root
|
||||
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
|
||||
// Wait: Active -> Rendezvous
|
||||
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
}
|
||||
|
||||
if ( rev_rank ) {
|
||||
m_pool_state = ThreadsExec::Rendezvous ;
|
||||
// Wait: Rendezvous -> Active
|
||||
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
|
||||
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
|
||||
}
|
||||
else {
|
||||
// Root thread does the reduction and broadcast
|
||||
@ -264,7 +264,7 @@ public:
|
||||
|
||||
ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ;
|
||||
|
||||
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
|
||||
|
||||
Join::join( f , reduce_memory() , fan.reduce_memory() );
|
||||
}
|
||||
@ -280,7 +280,7 @@ public:
|
||||
const int rev_rank = m_pool_size - ( m_pool_rank + 1 );
|
||||
|
||||
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
|
||||
Impl::spinwait( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<<i)]->m_pool_state , ThreadsExec::Active );
|
||||
}
|
||||
}
|
||||
|
||||
@ -312,7 +312,7 @@ public:
|
||||
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
|
||||
|
||||
// Wait: Active -> ReductionAvailable (or ScanAvailable)
|
||||
Impl::spinwait( fan.m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active );
|
||||
Join::join( f , work_value , fan.reduce_memory() );
|
||||
}
|
||||
|
||||
@ -330,8 +330,8 @@ public:
|
||||
|
||||
// Wait: Active -> ReductionAvailable
|
||||
// Wait: ReductionAvailable -> ScanAvailable
|
||||
Impl::spinwait( th.m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait( th.m_pool_state , ThreadsExec::ReductionAvailable );
|
||||
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable );
|
||||
|
||||
Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count );
|
||||
}
|
||||
@ -342,7 +342,7 @@ public:
|
||||
|
||||
// Wait for all threads to complete inclusive scan
|
||||
// Wait: ScanAvailable -> Rendezvous
|
||||
Impl::spinwait( m_pool_state , ThreadsExec::ScanAvailable );
|
||||
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable );
|
||||
}
|
||||
|
||||
//--------------------------------
|
||||
@ -350,7 +350,7 @@ public:
|
||||
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
|
||||
ThreadsExec & fan = *m_pool_base[ rev_rank + (1<<i) ];
|
||||
// Wait: ReductionAvailable -> ScanAvailable
|
||||
Impl::spinwait( fan.m_pool_state , ThreadsExec::ReductionAvailable );
|
||||
Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable );
|
||||
// Set: ScanAvailable -> Rendezvous
|
||||
fan.m_pool_state = ThreadsExec::Rendezvous ;
|
||||
}
|
||||
@ -377,13 +377,13 @@ public:
|
||||
// Wait for all threads to copy previous thread's inclusive scan value
|
||||
// Wait for all threads: Rendezvous -> ScanCompleted
|
||||
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
|
||||
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
|
||||
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Rendezvous );
|
||||
}
|
||||
if ( rev_rank ) {
|
||||
// Set: ScanAvailable -> ScanCompleted
|
||||
m_pool_state = ThreadsExec::ScanCompleted ;
|
||||
// Wait: ScanCompleted -> Active
|
||||
Impl::spinwait( m_pool_state , ThreadsExec::ScanCompleted );
|
||||
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted );
|
||||
}
|
||||
// Set: ScanCompleted -> Active
|
||||
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
|
||||
@ -410,7 +410,7 @@ public:
|
||||
// Fan-in reduction with highest ranking thread as the root
|
||||
for ( int i = 0 ; i < m_pool_fan_size ; ++i ) {
|
||||
// Wait: Active -> Rendezvous
|
||||
Impl::spinwait( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<<i) ]->m_pool_state , ThreadsExec::Active );
|
||||
}
|
||||
|
||||
for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; }
|
||||
@ -418,7 +418,7 @@ public:
|
||||
if ( rev_rank ) {
|
||||
m_pool_state = ThreadsExec::Rendezvous ;
|
||||
// Wait: Rendezvous -> Active
|
||||
Impl::spinwait( m_pool_state , ThreadsExec::Rendezvous );
|
||||
Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous );
|
||||
}
|
||||
else {
|
||||
// Root thread does the thread-scan before releasing threads
|
||||
|
||||
@ -49,6 +49,7 @@
|
||||
#include <utility>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
#include <impl/Kokkos_FunctorAdapter.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
|
||||
@ -103,13 +104,13 @@ public:
|
||||
|
||||
// Wait for fan-in threads
|
||||
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) {
|
||||
Impl::spinwait( m_team_base[j]->state() , ThreadsExec::Active );
|
||||
Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active );
|
||||
}
|
||||
|
||||
// If not root then wait for release
|
||||
if ( m_team_rank_rev ) {
|
||||
m_exec->state() = ThreadsExec::Rendezvous ;
|
||||
Impl::spinwait( m_exec->state() , ThreadsExec::Rendezvous );
|
||||
Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous );
|
||||
}
|
||||
|
||||
return ! m_team_rank_rev ;
|
||||
@ -350,6 +351,10 @@ public:
|
||||
const int team_rank_rev = pool_rank_rev % team.team_alloc();
|
||||
const size_t pool_league_size = m_exec->pool_size() / team.team_alloc() ;
|
||||
const size_t pool_league_rank_rev = pool_rank_rev / team.team_alloc() ;
|
||||
if(pool_league_rank_rev >= pool_league_size) {
|
||||
m_invalid_thread = 1;
|
||||
return;
|
||||
}
|
||||
const size_t pool_league_rank = pool_league_size - ( pool_league_rank_rev + 1 );
|
||||
|
||||
const int pool_num_teams = m_exec->pool_size()/team.team_alloc();
|
||||
@ -505,7 +510,8 @@ private:
|
||||
, const int team_size_request )
|
||||
{
|
||||
const int pool_size = traits::execution_space::thread_pool_size(0);
|
||||
const int team_max = traits::execution_space::thread_pool_size(1);
|
||||
const int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
const int team_max = pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
const int team_grain = traits::execution_space::thread_pool_size(2);
|
||||
|
||||
m_league_size = league_size_request ;
|
||||
@ -552,8 +558,12 @@ public:
|
||||
|
||||
template< class FunctorType >
|
||||
inline static
|
||||
int team_size_max( const FunctorType & )
|
||||
{ return traits::execution_space::thread_pool_size(1); }
|
||||
int team_size_max( const FunctorType & ) {
|
||||
int pool_size = traits::execution_space::thread_pool_size(1);
|
||||
int max_host_team_size = Impl::HostThreadTeamData::max_team_members;
|
||||
return pool_size<max_host_team_size?pool_size:max_host_team_size;
|
||||
}
|
||||
|
||||
|
||||
template< class FunctorType >
|
||||
static int team_size_recommended( const FunctorType & )
|
||||
@ -819,9 +829,7 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
result+=tmp;
|
||||
lambda(i,result);
|
||||
}
|
||||
}
|
||||
|
||||
@ -835,18 +843,14 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::T
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::ThreadsExecTeamMember >&
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& init_result) {
|
||||
loop_boundaries, const Lambda & lambda, const JoinType& join, ValueType& result ) {
|
||||
|
||||
ValueType result = init_result;
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
lambda(i,result);
|
||||
}
|
||||
init_result = result;
|
||||
}
|
||||
|
||||
/** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final)
|
||||
|
||||
2356
lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
Normal file
2356
lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -56,12 +56,13 @@ int bit_scan_forward( unsigned i )
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return __ffs(i) - 1;
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_ffs(i) - 1;
|
||||
#elif defined( __INTEL_COMPILER )
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
return _bit_scan_forward(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return __cnttz4(i);
|
||||
#elif defined( KOKKOS_COMPILER_GNU ) || defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_ffs(i) - 1;
|
||||
#else
|
||||
|
||||
unsigned t = 1u;
|
||||
int r = 0;
|
||||
while ( i && ( i & t == 0 ) )
|
||||
@ -79,10 +80,12 @@ int bit_scan_reverse( unsigned i )
|
||||
enum { shift = static_cast<int>( sizeof(unsigned) * CHAR_BIT - 1 ) };
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return shift - __clz(i);
|
||||
#elif defined( KOKKOS_COMPILER_INTEL )
|
||||
return _bit_scan_reverse(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return shift - __cntlz4(i);
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return shift - __builtin_clz(i);
|
||||
#elif defined( __INTEL_COMPILER )
|
||||
return _bit_scan_reverse(i);
|
||||
#else
|
||||
unsigned t = 1u << shift;
|
||||
int r = 0;
|
||||
@ -101,10 +104,12 @@ int bit_count( unsigned i )
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
return __popc(i);
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_popcount(i);
|
||||
#elif defined ( __INTEL_COMPILER )
|
||||
return _popcnt32(i);
|
||||
#elif defined( KOKKOS_COMPILER_IBM )
|
||||
return __popcnt4(i);
|
||||
#elif defined( __GNUC__ ) || defined( __GNUG__ )
|
||||
return __builtin_popcount(i);
|
||||
#else
|
||||
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
|
||||
i = i - ( ( i >> 1 ) & ~0u / 3u ); // temp
|
||||
|
||||
@ -147,7 +147,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
@ -155,7 +155,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0);
|
||||
void finalize_internal( const bool all_spaces = false )
|
||||
{
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
|
||||
@ -449,5 +449,323 @@ void fence()
|
||||
Impl::fence_internal();
|
||||
}
|
||||
|
||||
void print_configuration( std::ostream & out , const bool detail )
|
||||
{
|
||||
std::ostringstream msg;
|
||||
|
||||
msg << "Compiler:" << std::endl;
|
||||
#ifdef KOKKOS_COMPILER_APPLECC
|
||||
msg << " KOKKOS_COMPILER_APPLECC: " << KOKKOS_COMPILER_APPLECC << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_COMPILER_CLANG
|
||||
msg << " KOKKOS_COMPILER_CLANG: " << KOKKOS_COMPILER_CLANG << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_COMPILER_CRAYC
|
||||
msg << " KOKKOS_COMPILER_CRAYC: " << KOKKOS_COMPILER_CRAYC << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_COMPILER_GNU
|
||||
msg << " KOKKOS_COMPILER_GNU: " << KOKKOS_COMPILER_GNU << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_COMPILER_IBM
|
||||
msg << " KOKKOS_COMPILER_IBM: " << KOKKOS_COMPILER_IBM << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_COMPILER_INTEL
|
||||
msg << " KOKKOS_COMPILER_INTEL: " << KOKKOS_COMPILER_INTEL << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_COMPILER_NVCC
|
||||
msg << " KOKKOS_COMPILER_NVCC: " << KOKKOS_COMPILER_NVCC << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_COMPILER_PGI
|
||||
msg << " KOKKOS_COMPILER_PGI: " << KOKKOS_COMPILER_PGI << std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
msg << "Architecture:" << std::endl;
|
||||
#ifdef KOKKOS_ENABLE_ISA_KNC
|
||||
msg << " KOKKOS_ENABLE_ISA_KNC: yes" << std::endl;
|
||||
#else
|
||||
msg << " KOKKOS_ENABLE_ISA_KNC: no" << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_ISA_POWERPCLE
|
||||
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: yes" << std::endl;
|
||||
#else
|
||||
msg << " KOKKOS_ENABLE_ISA_POWERPCLE: no" << std::endl;
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_ISA_X86_64
|
||||
msg << " KOKKOS_ENABLE_ISA_X86_64: yes" << std::endl;
|
||||
#else
|
||||
msg << " KOKKOS_ENABLE_ISA_X86_64: no" << std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
msg << "Devices:" << std::endl;
|
||||
msg << " KOKKOS_ENABLE_CUDA: ";
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_OPENMP: ";
|
||||
#ifdef KOKKOS_ENABLE_OPENMP
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_PTHREAD: ";
|
||||
#ifdef KOKKOS_ENABLE_PTHREAD
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_STDTHREAD: ";
|
||||
#ifdef KOKKOS_ENABLE_STDTHREAD
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_WINTHREAD: ";
|
||||
#ifdef KOKKOS_ENABLE_WINTHREAD
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_QTHREADS: ";
|
||||
#ifdef KOKKOS_ENABLE_QTHREADS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_SERIAL: ";
|
||||
#ifdef KOKKOS_ENABLE_SERIAL
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
msg << "Default Device:" << std::endl;
|
||||
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA: ";
|
||||
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_CUDA
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP: ";
|
||||
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS: ";
|
||||
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_THREADS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS: ";
|
||||
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_QTHREADS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL: ";
|
||||
#ifdef KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SERIAL
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
msg << "Atomics:" << std::endl;
|
||||
msg << " KOKKOS_ENABLE_CUDA_ATOMICS: ";
|
||||
#ifdef KOKKOS_ENABLE_CUDA_ATOMICS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_GNU_ATOMICS: ";
|
||||
#ifdef KOKKOS_ENABLE_GNU_ATOMICS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_INTEL_ATOMICS: ";
|
||||
#ifdef KOKKOS_ENABLE_INTEL_ATOMICS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_OPENMP_ATOMICS: ";
|
||||
#ifdef KOKKOS_ENABLE_OPENMP_ATOMICS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_WINDOWS_ATOMICS: ";
|
||||
#ifdef KOKKOS_ENABLE_WINDOWS_ATOMICS
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
msg << "Vectorization:" << std::endl;
|
||||
msg << " KOKKOS_ENABLE_PRAGMA_IVDEP: ";
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_PRAGMA_LOOPCOUNT: ";
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_LOOPCOUNT
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_PRAGMA_SIMD: ";
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_SIMD
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_PRAGMA_UNROLL: ";
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_UNROLL
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_PRAGMA_VECTOR: ";
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_VECTOR
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
|
||||
msg << "Memory:" << std::endl;
|
||||
msg << " KOKKOS_ENABLE_HBWSPACE: ";
|
||||
#ifdef KOKKOS_ENABLE_HBWSPACE
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_INTEL_MM_ALLOC: ";
|
||||
#ifdef KOKKOS_ENABLE_INTEL_MM_ALLOC
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_POSIX_MEMALIGN: ";
|
||||
#ifdef KOKKOS_ENABLE_POSIX_MEMALIGN
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
|
||||
|
||||
msg << "Options:" << std::endl;
|
||||
msg << " KOKKOS_ENABLE_ASM: ";
|
||||
#ifdef KOKKOS_ENABLE_ASM
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_CXX1Z: ";
|
||||
#ifdef KOKKOS_ENABLE_CXX1Z
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK: ";
|
||||
#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_HWLOC: ";
|
||||
#ifdef KOKKOS_ENABLE_HWLOC
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_LIBRT: ";
|
||||
#ifdef KOKKOS_ENABLE_LIBRT
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_MPI: ";
|
||||
#ifdef KOKKOS_ENABLE_MPI
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_PROFILING: ";
|
||||
#ifdef KOKKOS_ENABLE_PROFILING
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
msg << "Cuda Options:" << std::endl;
|
||||
msg << " KOKKOS_ENABLE_CUDA_LAMBDA: ";
|
||||
#ifdef KOKKOS_ENABLE_CUDA_LAMBDA
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: ";
|
||||
#ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: ";
|
||||
#ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_CUDA_UVM: ";
|
||||
#ifdef KOKKOS_ENABLE_CUDA_UVM
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_CUSPARSE: ";
|
||||
#ifdef KOKKOS_ENABLE_CUSPARSE
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: ";
|
||||
#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA
|
||||
msg << "yes" << std::endl;
|
||||
#else
|
||||
msg << "no" << std::endl;
|
||||
#endif
|
||||
|
||||
#endif
|
||||
|
||||
msg << "\nRuntime Configuration:" << std::endl;
|
||||
#ifdef KOKKOS_ENABLE_CUDA
|
||||
Cuda::print_configuration(msg, detail);
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_OPENMP
|
||||
OpenMP::print_configuration(msg, detail);
|
||||
#endif
|
||||
#if defined( KOKKOS_ENABLE_PTHREAD ) || defined( WINTHREAD )
|
||||
Threads::print_configuration(msg, detail);
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_QTHREADS
|
||||
Qthreads::print_configuration(msg, detail);
|
||||
#endif
|
||||
#ifdef KOKKOS_ENABLE_SERIAL
|
||||
Serial::print_configuration(msg, detail);
|
||||
#endif
|
||||
|
||||
out << msg.str() << std::endl;
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
|
||||
653
lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
Normal file
653
lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp
Normal file
@ -0,0 +1,653 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_FUNCTORANALYSIS_HPP
|
||||
#define KOKKOS_FUNCTORANALYSIS_HPP
|
||||
|
||||
#include <cstddef>
|
||||
#include <Kokkos_Core_fwd.hpp>
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
#include <impl/Kokkos_Tags.hpp>
|
||||
#include <impl/Kokkos_Reducer.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
struct FunctorPatternInterface {
|
||||
struct FOR {};
|
||||
struct REDUCE {};
|
||||
struct SCAN {};
|
||||
};
|
||||
|
||||
/** \brief Query Functor and execution policy argument tag for value type.
|
||||
*
|
||||
* If 'value_type' is not explicitly declared in the functor
|
||||
* then attempt to deduce the type from FunctorType::operator()
|
||||
* interface used by the pattern and policy.
|
||||
*
|
||||
* For the REDUCE pattern generate a Reducer and finalization function
|
||||
* derived from what is available within the functor.
|
||||
*/
|
||||
template< typename PatternInterface , class Policy , class Functor >
|
||||
struct FunctorAnalysis {
|
||||
private:
|
||||
|
||||
using FOR = FunctorPatternInterface::FOR ;
|
||||
using REDUCE = FunctorPatternInterface::REDUCE ;
|
||||
using SCAN = FunctorPatternInterface::SCAN ;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
struct VOID {};
|
||||
|
||||
template< typename P = Policy , typename = std::false_type >
|
||||
struct has_work_tag
|
||||
{
|
||||
using type = void ;
|
||||
using wtag = VOID ;
|
||||
};
|
||||
|
||||
template< typename P >
|
||||
struct has_work_tag
|
||||
< P , typename std::is_same< typename P::work_tag , void >::type >
|
||||
{
|
||||
using type = typename P::work_tag ;
|
||||
using wtag = typename P::work_tag ;
|
||||
};
|
||||
|
||||
using Tag = typename has_work_tag<>::type ;
|
||||
using WTag = typename has_work_tag<>::wtag ;
|
||||
|
||||
//----------------------------------------
|
||||
// Check for Functor::value_type, which is either a simple type T or T[]
|
||||
|
||||
template< typename F , typename = std::false_type >
|
||||
struct has_value_type { using type = void ; };
|
||||
|
||||
template< typename F >
|
||||
struct has_value_type
|
||||
< F , typename std::is_same< typename F::value_type , void >::type >
|
||||
{
|
||||
using type = typename F::value_type ;
|
||||
|
||||
static_assert( ! std::is_reference< type >::value &&
|
||||
std::rank< type >::value <= 1 &&
|
||||
std::extent< type >::value == 0
|
||||
, "Kokkos Functor::value_type is T or T[]" );
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
// If Functor::value_type does not exist then evaluate operator(),
|
||||
// depending upon the pattern and whether the policy has a work tag,
|
||||
// to determine the reduction or scan value_type.
|
||||
|
||||
template< typename F
|
||||
, typename P = PatternInterface
|
||||
, typename V = typename has_value_type<F>::type
|
||||
, bool T = std::is_same< Tag , void >::value
|
||||
>
|
||||
struct deduce_value_type { using type = V ; };
|
||||
|
||||
template< typename F >
|
||||
struct deduce_value_type< F , REDUCE , void , true > {
|
||||
|
||||
template< typename M , typename A >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
A deduce( void (Functor::*)( M , A & ) const );
|
||||
|
||||
using type = decltype( deduce( & F::operator() ) );
|
||||
};
|
||||
|
||||
template< typename F >
|
||||
struct deduce_value_type< F , REDUCE , void , false > {
|
||||
|
||||
template< typename M , typename A >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
A deduce( void (Functor::*)( WTag , M , A & ) const );
|
||||
|
||||
template< typename M , typename A >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
A deduce( void (Functor::*)( WTag const & , M , A & ) const );
|
||||
|
||||
using type = decltype( deduce( & F::operator() ) );
|
||||
};
|
||||
|
||||
template< typename F >
|
||||
struct deduce_value_type< F , SCAN , void , true > {
|
||||
|
||||
template< typename M , typename A , typename I >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
A deduce( void (Functor::*)( M , A & , I ) const );
|
||||
|
||||
using type = decltype( deduce( & F::operator() ) );
|
||||
};
|
||||
|
||||
template< typename F >
|
||||
struct deduce_value_type< F , SCAN , void , false > {
|
||||
|
||||
template< typename M , typename A , typename I >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
A deduce( void (Functor::*)( WTag , M , A & , I ) const );
|
||||
|
||||
template< typename M , typename A , typename I >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
A deduce( void (Functor::*)( WTag const & , M , A & , I ) const );
|
||||
|
||||
using type = decltype( deduce( & F::operator() ) );
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
using candidate_type = typename deduce_value_type< Functor >::type ;
|
||||
|
||||
enum { candidate_is_void = std::is_same< candidate_type , void >::value
|
||||
, candidate_is_array = std::rank< candidate_type >::value == 1 };
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
public:
|
||||
|
||||
using value_type = typename std::remove_extent< candidate_type >::type ;
|
||||
|
||||
static_assert( ! std::is_const< value_type >::value
|
||||
, "Kokkos functor operator reduce argument cannot be const" );
|
||||
|
||||
private:
|
||||
|
||||
// Stub to avoid defining a type 'void &'
|
||||
using ValueType = typename
|
||||
std::conditional< candidate_is_void , VOID , value_type >::type ;
|
||||
|
||||
public:
|
||||
|
||||
using pointer_type = typename
|
||||
std::conditional< candidate_is_void , void , ValueType * >::type ;
|
||||
|
||||
using reference_type = typename
|
||||
std::conditional< candidate_is_array , ValueType * , typename
|
||||
std::conditional< ! candidate_is_void , ValueType & , void >
|
||||
::type >::type ;
|
||||
|
||||
private:
|
||||
|
||||
template< bool IsArray , class FF >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
typename std::enable_if< IsArray , unsigned >::type
|
||||
get_length( FF const & f ) { return f.value_count ; }
|
||||
|
||||
template< bool IsArray , class FF >
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
typename std::enable_if< ! IsArray , unsigned >::type
|
||||
get_length( FF const & ) { return 1 ; }
|
||||
|
||||
public:
|
||||
|
||||
enum { StaticValueSize = ! candidate_is_void &&
|
||||
! candidate_is_array
|
||||
? sizeof(ValueType) : 0 };
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
unsigned value_count( const Functor & f )
|
||||
{ return FunctorAnalysis::template get_length< candidate_is_array >(f); }
|
||||
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
unsigned value_size( const Functor & f )
|
||||
{ return FunctorAnalysis::template get_length< candidate_is_array >(f) * sizeof(ValueType); }
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class Unknown >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
unsigned value_count( const Unknown & )
|
||||
{ return 1 ; }
|
||||
|
||||
template< class Unknown >
|
||||
KOKKOS_FORCEINLINE_FUNCTION static
|
||||
unsigned value_size( const Unknown & )
|
||||
{ return sizeof(ValueType); }
|
||||
|
||||
private:
|
||||
|
||||
enum INTERFACE : int
|
||||
{ DISABLE = 0
|
||||
, NO_TAG_NOT_ARRAY = 1
|
||||
, NO_TAG_IS_ARRAY = 2
|
||||
, HAS_TAG_NOT_ARRAY = 3
|
||||
, HAS_TAG_IS_ARRAY = 4
|
||||
, DEDUCED =
|
||||
! std::is_same< PatternInterface , REDUCE >::value ? DISABLE : (
|
||||
std::is_same<Tag,void>::value
|
||||
? (candidate_is_array ? NO_TAG_IS_ARRAY : NO_TAG_NOT_ARRAY)
|
||||
: (candidate_is_array ? HAS_TAG_IS_ARRAY : HAS_TAG_NOT_ARRAY) )
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
// parallel_reduce join operator
|
||||
|
||||
template< class F , INTERFACE >
|
||||
struct has_join_function ;
|
||||
|
||||
template< class F >
|
||||
struct has_join_function< F , NO_TAG_NOT_ARRAY >
|
||||
{
|
||||
typedef volatile ValueType & vref_type ;
|
||||
typedef volatile const ValueType & cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( *dst , *src ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_join_function< F , NO_TAG_IS_ARRAY >
|
||||
{
|
||||
typedef volatile ValueType * vref_type ;
|
||||
typedef volatile const ValueType * cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( dst , src ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_join_function< F , HAS_TAG_NOT_ARRAY >
|
||||
{
|
||||
typedef volatile ValueType & vref_type ;
|
||||
typedef volatile const ValueType & cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( WTag() , *dst , *src ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_join_function< F , HAS_TAG_IS_ARRAY >
|
||||
{
|
||||
typedef volatile ValueType * vref_type ;
|
||||
typedef volatile const ValueType * cvref_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , vref_type , cvref_type ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{ f.join( WTag() , dst , src ); }
|
||||
};
|
||||
|
||||
|
||||
template< class F = Functor
|
||||
, INTERFACE = DEDUCED
|
||||
, typename = void >
|
||||
struct DeduceJoin
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const & f
|
||||
, ValueType volatile * dst
|
||||
, ValueType volatile const * src )
|
||||
{
|
||||
const int n = FunctorAnalysis::value_count( f );
|
||||
for ( int i = 0 ; i < n ; ++i ) dst[i] += src[i];
|
||||
}
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct DeduceJoin< F , DISABLE , void >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( F const &
|
||||
, ValueType volatile *
|
||||
, ValueType volatile const * ) {}
|
||||
};
|
||||
|
||||
template< class F , INTERFACE I >
|
||||
struct DeduceJoin< F , I ,
|
||||
decltype( has_join_function<F,I>::enable_if( & F::join ) ) >
|
||||
: public has_join_function<F,I> {};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
template< class , INTERFACE >
|
||||
struct has_init_function ;
|
||||
|
||||
template< class F >
|
||||
struct has_init_function< F , NO_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( *dst ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_init_function< F , NO_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( dst ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_init_function< F , HAS_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( WTag(), *dst ); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct has_init_function< F , HAS_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & f , ValueType * dst )
|
||||
{ f.init( WTag(), dst ); }
|
||||
};
|
||||
|
||||
template< class F = Functor
|
||||
, INTERFACE = DEDUCED
|
||||
, typename = void >
|
||||
struct DeduceInit
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & , ValueType * dst ) { new(dst) ValueType(); }
|
||||
};
|
||||
|
||||
template< class F >
|
||||
struct DeduceInit< F , DISABLE , void >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( F const & , ValueType * ) {}
|
||||
};
|
||||
|
||||
template< class F , INTERFACE I >
|
||||
struct DeduceInit< F , I ,
|
||||
decltype( has_init_function<F,I>::enable_if( & F::init ) ) >
|
||||
: public has_init_function<F,I> {};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
public:
|
||||
|
||||
struct Reducer
|
||||
{
|
||||
private:
|
||||
|
||||
Functor const & m_functor ;
|
||||
ValueType * const m_result ;
|
||||
int const m_length ;
|
||||
|
||||
public:
|
||||
|
||||
using reducer = Reducer ;
|
||||
using value_type = FunctorAnalysis::value_type ;
|
||||
using memory_space = void ;
|
||||
using reference_type = FunctorAnalysis::reference_type ;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join( ValueType volatile * dst
|
||||
, ValueType volatile const * src ) const noexcept
|
||||
{ DeduceJoin<>::join( m_functor , dst , src ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void init( ValueType * dst ) const noexcept
|
||||
{ DeduceInit<>::init( m_functor , dst ); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
constexpr Reducer( Functor const & arg_functor
|
||||
, ValueType * arg_value = 0
|
||||
, int arg_length = 0 ) noexcept
|
||||
: m_functor( arg_functor ), m_result(arg_value), m_length(arg_length) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr int length() const noexcept { return m_length ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ValueType & operator[]( int i ) const noexcept
|
||||
{ return m_result[i]; }
|
||||
|
||||
private:
|
||||
|
||||
template< bool IsArray >
|
||||
constexpr
|
||||
typename std::enable_if< IsArray , ValueType * >::type
|
||||
ref() const noexcept { return m_result ; }
|
||||
|
||||
template< bool IsArray >
|
||||
constexpr
|
||||
typename std::enable_if< ! IsArray , ValueType & >::type
|
||||
ref() const noexcept { return *m_result ; }
|
||||
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
auto result() const noexcept
|
||||
-> decltype( Reducer::template ref< candidate_is_array >() )
|
||||
{ return Reducer::template ref< candidate_is_array >(); }
|
||||
};
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
private:
|
||||
|
||||
template< class , INTERFACE >
|
||||
struct has_final_function ;
|
||||
|
||||
// No tag, not array
|
||||
template< class F >
|
||||
struct has_final_function< F , NO_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( *dst ); }
|
||||
};
|
||||
|
||||
// No tag, is array
|
||||
template< class F >
|
||||
struct has_final_function< F , NO_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( dst ); }
|
||||
};
|
||||
|
||||
// Has tag, not array
|
||||
template< class F >
|
||||
struct has_final_function< F , HAS_TAG_NOT_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , ValueType & ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( WTag(), *dst ); }
|
||||
};
|
||||
|
||||
// Has tag, is array
|
||||
template< class F >
|
||||
struct has_final_function< F , HAS_TAG_IS_ARRAY >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (F::*)( WTag const & , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void enable_if( void (*)( WTag const & , ValueType * ) );
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void final( F const & f , ValueType * dst )
|
||||
{ f.final( WTag(), dst ); }
|
||||
};
|
||||
|
||||
template< class F = Functor
|
||||
, INTERFACE = DEDUCED
|
||||
, typename = void >
|
||||
struct DeduceFinal
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static void final( F const & , ValueType * ) {}
|
||||
};
|
||||
|
||||
template< class F , INTERFACE I >
|
||||
struct DeduceFinal< F , I ,
|
||||
decltype( has_final_function<F,I>::enable_if( & F::final ) ) >
|
||||
: public has_init_function<F,I> {};
|
||||
|
||||
public:
|
||||
|
||||
static void final( Functor const & f , ValueType * result )
|
||||
{ DeduceFinal<>::final( f , result ); }
|
||||
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* KOKKOS_FUNCTORANALYSIS_HPP */
|
||||
|
||||
@ -62,7 +62,7 @@
|
||||
#include <memkind.h>
|
||||
#endif
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
|
||||
@ -249,7 +249,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
|
||||
SharedAllocationRecord< Kokkos::Experimental::HBWSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::Experimental::HBWSpace::name()),RecordBase::m_alloc_ptr->m_label,
|
||||
@ -278,7 +278,7 @@ SharedAllocationRecord( const Kokkos::Experimental::HBWSpace & arg_space
|
||||
)
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
|
||||
@ -43,7 +43,7 @@
|
||||
|
||||
#include <algorithm>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
#endif
|
||||
/*--------------------------------------------------------------------------*/
|
||||
@ -359,7 +359,7 @@ deallocate( SharedAllocationRecord< void , void > * arg_rec )
|
||||
SharedAllocationRecord< Kokkos::HostSpace , void >::
|
||||
~SharedAllocationRecord()
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::deallocateData(
|
||||
Kokkos::Profiling::SpaceHandle(Kokkos::HostSpace::name()),RecordBase::m_alloc_ptr->m_label,
|
||||
@ -388,7 +388,7 @@ SharedAllocationRecord( const Kokkos::HostSpace & arg_space
|
||||
)
|
||||
, m_space( arg_space )
|
||||
{
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
if(Kokkos::Profiling::profileLibraryLoaded()) {
|
||||
Kokkos::Profiling::allocateData(Kokkos::Profiling::SpaceHandle(arg_space.name()),arg_label,data(),arg_alloc_size);
|
||||
}
|
||||
|
||||
463
lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
Normal file
463
lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp
Normal file
@ -0,0 +1,463 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#include <limits>
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <impl/Kokkos_HostThreadTeam.hpp>
|
||||
#include <impl/Kokkos_Error.hpp>
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
void HostThreadTeamData::organize_pool
|
||||
( HostThreadTeamData * members[] , const int size )
|
||||
{
|
||||
bool ok = true ;
|
||||
|
||||
// Verify not already a member of a pool:
|
||||
for ( int rank = 0 ; rank < size && ok ; ++rank ) {
|
||||
ok = ( 0 != members[rank] ) && ( 0 == members[rank]->m_pool_scratch );
|
||||
}
|
||||
|
||||
if ( ok ) {
|
||||
|
||||
int64_t * const root_scratch = members[0]->m_scratch ;
|
||||
|
||||
for ( int i = m_pool_rendezvous ; i < m_pool_reduce ; ++i ) {
|
||||
root_scratch[i] = 0 ;
|
||||
}
|
||||
|
||||
{
|
||||
HostThreadTeamData ** const pool =
|
||||
(HostThreadTeamData **) (root_scratch + m_pool_members);
|
||||
|
||||
// team size == 1, league size == pool_size
|
||||
|
||||
for ( int rank = 0 ; rank < size ; ++rank ) {
|
||||
HostThreadTeamData * const mem = members[ rank ] ;
|
||||
mem->m_pool_scratch = root_scratch ;
|
||||
mem->m_team_scratch = mem->m_scratch ;
|
||||
mem->m_pool_rank = rank ;
|
||||
mem->m_pool_size = size ;
|
||||
mem->m_team_base = rank ;
|
||||
mem->m_team_rank = 0 ;
|
||||
mem->m_team_size = 1 ;
|
||||
mem->m_team_alloc = 1 ;
|
||||
mem->m_league_rank = rank ;
|
||||
mem->m_league_size = size ;
|
||||
mem->m_pool_rendezvous_step = 0 ;
|
||||
mem->m_team_rendezvous_step = 0 ;
|
||||
pool[ rank ] = mem ;
|
||||
}
|
||||
}
|
||||
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_pool ERROR pool already exists");
|
||||
}
|
||||
}
|
||||
|
||||
void HostThreadTeamData::disband_pool()
|
||||
{
|
||||
m_work_range.first = -1 ;
|
||||
m_work_range.second = -1 ;
|
||||
m_pool_scratch = 0 ;
|
||||
m_team_scratch = 0 ;
|
||||
m_pool_rank = 0 ;
|
||||
m_pool_size = 1 ;
|
||||
m_team_base = 0 ;
|
||||
m_team_rank = 0 ;
|
||||
m_team_size = 1 ;
|
||||
m_team_alloc = 1 ;
|
||||
m_league_rank = 0 ;
|
||||
m_league_size = 1 ;
|
||||
m_pool_rendezvous_step = 0 ;
|
||||
m_team_rendezvous_step = 0 ;
|
||||
}
|
||||
|
||||
int HostThreadTeamData::organize_team( const int team_size )
|
||||
{
|
||||
// Pool is initialized
|
||||
const bool ok_pool = 0 != m_pool_scratch ;
|
||||
|
||||
// Team is not set
|
||||
const bool ok_team =
|
||||
m_team_scratch == m_scratch &&
|
||||
m_team_base == m_pool_rank &&
|
||||
m_team_rank == 0 &&
|
||||
m_team_size == 1 &&
|
||||
m_team_alloc == 1 &&
|
||||
m_league_rank == m_pool_rank &&
|
||||
m_league_size == m_pool_size ;
|
||||
|
||||
if ( ok_pool && ok_team ) {
|
||||
|
||||
if ( team_size <= 0 ) return 0 ; // No teams to organize
|
||||
|
||||
if ( team_size == 1 ) return 1 ; // Already organized in teams of one
|
||||
|
||||
HostThreadTeamData * const * const pool =
|
||||
(HostThreadTeamData **) (m_pool_scratch + m_pool_members);
|
||||
|
||||
// "league_size" in this context is the number of concurrent teams
|
||||
// that the pool can accommodate. Excess threads are idle.
|
||||
const int league_size = m_pool_size / team_size ;
|
||||
const int team_alloc_size = m_pool_size / league_size ;
|
||||
const int team_alloc_rank = m_pool_rank % team_alloc_size ;
|
||||
const int league_rank = m_pool_rank / team_alloc_size ;
|
||||
const int team_base_rank = league_rank * team_alloc_size ;
|
||||
|
||||
m_team_scratch = pool[ team_base_rank ]->m_scratch ;
|
||||
m_team_base = team_base_rank ;
|
||||
// This needs to check overflow, if m_pool_size % team_alloc_size !=0
|
||||
// there are two corner cases:
|
||||
// (i) if team_alloc_size == team_size there might be a non-full
|
||||
// zombi team around (for example m_pool_size = 5 and team_size = 2
|
||||
// (ii) if team_alloc > team_size then the last team might have less
|
||||
// threads than the others
|
||||
m_team_rank = ( team_base_rank + team_size <= m_pool_size ) &&
|
||||
( team_alloc_rank < team_size ) ?
|
||||
team_alloc_rank : -1;
|
||||
m_team_size = team_size ;
|
||||
m_team_alloc = team_alloc_size ;
|
||||
m_league_rank = league_rank ;
|
||||
m_league_size = league_size ;
|
||||
m_team_rendezvous_step = 0 ;
|
||||
|
||||
if ( team_base_rank == m_pool_rank ) {
|
||||
// Initialize team's rendezvous memory
|
||||
for ( int i = m_team_rendezvous ; i < m_pool_reduce ; ++i ) {
|
||||
m_scratch[i] = 0 ;
|
||||
}
|
||||
// Make sure team's rendezvous memory initialized
|
||||
// is written before proceeding.
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
// Organizing threads into a team performs a barrier across the
|
||||
// entire pool to insure proper initialization of the team
|
||||
// rendezvous mechanism before a team rendezvous can be performed.
|
||||
|
||||
if ( pool_rendezvous() ) {
|
||||
pool_rendezvous_release();
|
||||
}
|
||||
}
|
||||
else {
|
||||
Kokkos::Impl::throw_runtime_exception("Kokkos::Impl::HostThreadTeamData::organize_team ERROR");
|
||||
}
|
||||
|
||||
return 0 <= m_team_rank ;
|
||||
}
|
||||
|
||||
void HostThreadTeamData::disband_team()
|
||||
{
|
||||
m_team_scratch = m_scratch ;
|
||||
m_team_base = m_pool_rank ;
|
||||
m_team_rank = 0 ;
|
||||
m_team_size = 1 ;
|
||||
m_team_alloc = 1 ;
|
||||
m_league_rank = m_pool_rank ;
|
||||
m_league_size = m_pool_size ;
|
||||
m_team_rendezvous_step = 0 ;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/* pattern for rendezvous
|
||||
*
|
||||
* if ( rendezvous() ) {
|
||||
* ... all other threads are still in team_rendezvous() ...
|
||||
* rendezvous_release();
|
||||
* ... all other threads are released from team_rendezvous() ...
|
||||
* }
|
||||
*/
|
||||
|
||||
int HostThreadTeamData::rendezvous( int64_t * const buffer
|
||||
, int & rendezvous_step
|
||||
, int const size
|
||||
, int const rank ) noexcept
|
||||
{
|
||||
enum : int { shift_byte = 3 };
|
||||
enum : int { size_byte = ( 01 << shift_byte ) }; // == 8
|
||||
enum : int { mask_byte = size_byte - 1 };
|
||||
|
||||
enum : int { shift_mem_cycle = 2 };
|
||||
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
|
||||
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
|
||||
|
||||
// Cycle step values: 1 <= step <= size_val_cycle
|
||||
// An odd multiple of memory cycle so that when a memory location
|
||||
// is reused it has a different value.
|
||||
// Must be representable within a single byte: size_val_cycle < 16
|
||||
|
||||
enum : int { size_val_cycle = 3 * size_mem_cycle };
|
||||
|
||||
// Requires:
|
||||
// Called by rank = [ 0 .. size )
|
||||
// buffer aligned to int64_t[4]
|
||||
|
||||
// A sequence of rendezvous uses four cycled locations in memory
|
||||
// and non-equal cycled synchronization values to
|
||||
// 1) prevent rendezvous from overtaking one another and
|
||||
// 2) give each spin wait location an int64_t[4] span
|
||||
// so that it has its own cache line.
|
||||
|
||||
const int step = ( rendezvous_step % size_val_cycle ) + 1 ;
|
||||
|
||||
rendezvous_step = step ;
|
||||
|
||||
// The leading int64_t[4] span is for thread 0 to write
|
||||
// and all other threads to read spin-wait.
|
||||
// sync_offset is the index into this array for this step.
|
||||
|
||||
const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ;
|
||||
|
||||
union {
|
||||
int64_t full ;
|
||||
int8_t byte[8] ;
|
||||
} value ;
|
||||
|
||||
if ( rank ) {
|
||||
|
||||
const int group_begin = rank << shift_byte ; // == rank * size_byte
|
||||
|
||||
if ( group_begin < size ) {
|
||||
|
||||
// This thread waits for threads
|
||||
// [ group_begin .. group_begin + 8 )
|
||||
// [ rank*8 .. rank*8 + 8 )
|
||||
// to write to their designated bytes.
|
||||
|
||||
const int end = group_begin + size_byte < size
|
||||
? size_byte : size - group_begin ;
|
||||
|
||||
value.full = 0 ;
|
||||
for ( int i = 0 ; i < end ; ++i ) value.byte[i] = int8_t( step );
|
||||
|
||||
store_fence(); // This should not be needed but fixes #742
|
||||
|
||||
spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ]
|
||||
, value.full );
|
||||
}
|
||||
|
||||
{
|
||||
// This thread sets its designated byte.
|
||||
// ( rank % size_byte ) +
|
||||
// ( ( rank / size_byte ) * size_byte * size_mem_cycle ) +
|
||||
// ( sync_offset * size_byte )
|
||||
const int offset = ( rank & mask_byte )
|
||||
+ ( ( rank & ~mask_byte ) << shift_mem_cycle )
|
||||
+ ( sync_offset << shift_byte );
|
||||
|
||||
// All of this thread's previous memory stores must be complete before
|
||||
// this thread stores the step value at this thread's designated byte
|
||||
// in the shared synchronization array.
|
||||
|
||||
Kokkos::memory_fence();
|
||||
|
||||
((volatile int8_t*) buffer)[ offset ] = int8_t( step );
|
||||
|
||||
// Memory fence to push the previous store out
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
// Wait for thread 0 to release all other threads
|
||||
|
||||
spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) );
|
||||
|
||||
}
|
||||
else {
|
||||
// Thread 0 waits for threads [1..7]
|
||||
// to write to their designated bytes.
|
||||
|
||||
const int end = size_byte < size ? 8 : size ;
|
||||
|
||||
value.full = 0 ;
|
||||
for ( int i = 1 ; i < end ; ++i ) value.byte[i] = int8_t( step );
|
||||
|
||||
spinwait_until_equal( buffer[ sync_offset ], value.full );
|
||||
}
|
||||
|
||||
return rank ? 0 : 1 ;
|
||||
}
|
||||
|
||||
void HostThreadTeamData::
|
||||
rendezvous_release( int64_t * const buffer
|
||||
, int const rendezvous_step ) noexcept
|
||||
{
|
||||
enum : int { shift_mem_cycle = 2 };
|
||||
enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4
|
||||
enum : int { mask_mem_cycle = size_mem_cycle - 1 };
|
||||
|
||||
// Requires:
|
||||
// Called after team_rendezvous
|
||||
// Called only by true == team_rendezvous(root)
|
||||
|
||||
// Memory fence to be sure all previous writes are complete:
|
||||
Kokkos::memory_fence();
|
||||
|
||||
((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] =
|
||||
int64_t( rendezvous_step );
|
||||
|
||||
// Memory fence to push the store out
|
||||
Kokkos::memory_fence();
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
int HostThreadTeamData::get_work_stealing() noexcept
|
||||
{
|
||||
pair_int_t w( -1 , -1 );
|
||||
|
||||
if ( 1 == m_team_size || team_rendezvous() ) {
|
||||
|
||||
// Attempt first from beginning of my work range
|
||||
for ( int attempt = m_work_range.first < m_work_range.second ; attempt ; ) {
|
||||
|
||||
// Query and attempt to update m_work_range
|
||||
// from: [ w.first , w.second )
|
||||
// to: [ w.first + 1 , w.second ) = w_new
|
||||
//
|
||||
// If w is invalid then is just a query.
|
||||
|
||||
const pair_int_t w_new( w.first + 1 , w.second );
|
||||
|
||||
w = Kokkos::atomic_compare_exchange( & m_work_range, w, w_new );
|
||||
|
||||
if ( w.first < w.second ) {
|
||||
// m_work_range is viable
|
||||
|
||||
// If steal is successful then don't repeat attempt to steal
|
||||
attempt = ! ( w_new.first == w.first + 1 &&
|
||||
w_new.second == w.second );
|
||||
}
|
||||
else {
|
||||
// m_work_range is not viable
|
||||
w.first = -1 ;
|
||||
w.second = -1 ;
|
||||
|
||||
attempt = 0 ;
|
||||
}
|
||||
}
|
||||
|
||||
if ( w.first == -1 && m_steal_rank != m_pool_rank ) {
|
||||
|
||||
HostThreadTeamData * const * const pool =
|
||||
(HostThreadTeamData**)( m_pool_scratch + m_pool_members );
|
||||
|
||||
// Attempt from begining failed, try to steal from end of neighbor
|
||||
|
||||
pair_int_t volatile * steal_range =
|
||||
& ( pool[ m_steal_rank ]->m_work_range );
|
||||
|
||||
for ( int attempt = true ; attempt ; ) {
|
||||
|
||||
// Query and attempt to update steal_work_range
|
||||
// from: [ w.first , w.second )
|
||||
// to: [ w.first , w.second - 1 ) = w_new
|
||||
//
|
||||
// If w is invalid then is just a query.
|
||||
|
||||
const pair_int_t w_new( w.first , w.second - 1 );
|
||||
|
||||
w = Kokkos::atomic_compare_exchange( steal_range, w, w_new );
|
||||
|
||||
if ( w.first < w.second ) {
|
||||
// steal_work_range is viable
|
||||
|
||||
// If steal is successful then don't repeat attempt to steal
|
||||
attempt = ! ( w_new.first == w.first &&
|
||||
w_new.second == w.second - 1 );
|
||||
}
|
||||
else {
|
||||
// steal_work_range is not viable, move to next member
|
||||
w.first = -1 ;
|
||||
w.second = -1 ;
|
||||
|
||||
// We need to figure out whether the next team is active
|
||||
// m_steal_rank + m_team_alloc could be the next base_rank to steal from
|
||||
// but only if there are another m_team_size threads available so that that
|
||||
// base rank has a full team.
|
||||
m_steal_rank = m_steal_rank + m_team_alloc + m_team_size <= m_pool_size ?
|
||||
m_steal_rank + m_team_alloc : 0;
|
||||
|
||||
steal_range = & ( pool[ m_steal_rank ]->m_work_range );
|
||||
|
||||
// If tried all other members then don't repeat attempt to steal
|
||||
attempt = m_steal_rank != m_pool_rank ;
|
||||
}
|
||||
}
|
||||
|
||||
if ( w.first != -1 ) w.first = w.second - 1 ;
|
||||
}
|
||||
|
||||
if ( 1 < m_team_size ) {
|
||||
// Must share the work index
|
||||
*((int volatile *) team_reduce()) = w.first ;
|
||||
|
||||
team_rendezvous_release();
|
||||
}
|
||||
}
|
||||
else if ( 1 < m_team_size ) {
|
||||
w.first = *((int volatile *) team_reduce());
|
||||
}
|
||||
|
||||
// May exit because successfully stole work and w is good.
|
||||
// May exit because no work left to steal and w = (-1,-1).
|
||||
|
||||
#if 0
|
||||
fprintf(stdout,"HostThreadTeamData::get_work_stealing() pool(%d of %d) %d\n"
|
||||
, m_pool_rank , m_pool_size , w.first );
|
||||
fflush(stdout);
|
||||
#endif
|
||||
|
||||
return w.first ;
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
1090
lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
Normal file
1090
lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp
Normal file
File diff suppressed because it is too large
Load Diff
@ -52,6 +52,10 @@ void memory_fence()
|
||||
{
|
||||
#if defined( __CUDA_ARCH__ )
|
||||
__threadfence();
|
||||
#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 )
|
||||
asm volatile (
|
||||
"mfence" ::: "memory"
|
||||
);
|
||||
#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \
|
||||
( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) )
|
||||
__sync_synchronize();
|
||||
|
||||
@ -129,8 +129,8 @@
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_HAVE_CUDA_RDC
|
||||
#ifndef KOKKOS_ENABLE_CUDA_RDC
|
||||
#define KOKKOS_ENABLE_CUDA_RDC KOKKOS_HAVE_CUDA_RDC
|
||||
#ifndef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE
|
||||
#define KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE KOKKOS_HAVE_CUDA_RDC
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -242,9 +242,9 @@
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef KOKKOS_HAVE_QTHREAD
|
||||
#ifndef KOKKOS_ENABLE_QTHREAD
|
||||
#define KOKKOS_ENABLE_QTHREAD KOKKOS_HAVE_QTHREAD
|
||||
#ifdef KOKKOS_HAVE_QTHREADS
|
||||
#ifndef KOKKOS_ENABLE_QTHREADS
|
||||
#define KOKKOS_ENABLE_QTHREADS KOKKOS_HAVE_QTHREADS
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
@ -43,7 +43,7 @@
|
||||
|
||||
#include <impl/Kokkos_Profiling_Interface.hpp>
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <string.h>
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
@ -50,7 +50,7 @@
|
||||
#include <string>
|
||||
#include <cinttypes>
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
#include <impl/Kokkos_Profiling_DeviceInfo.hpp>
|
||||
#include <dlfcn.h>
|
||||
#include <iostream>
|
||||
@ -59,7 +59,7 @@
|
||||
|
||||
#define KOKKOSP_INTERFACE_VERSION 20150628
|
||||
|
||||
#if (KOKKOS_ENABLE_PROFILING)
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
namespace Kokkos {
|
||||
namespace Profiling {
|
||||
|
||||
|
||||
317
lib/kokkos/core/src/impl/Kokkos_Reducer.hpp
Normal file
317
lib/kokkos/core/src/impl/Kokkos_Reducer.hpp
Normal file
@ -0,0 +1,317 @@
|
||||
/*
|
||||
//@HEADER
|
||||
// ************************************************************************
|
||||
//
|
||||
// Kokkos v. 2.0
|
||||
// Copyright (2014) Sandia Corporation
|
||||
//
|
||||
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
|
||||
// the U.S. Government retains certain rights in this software.
|
||||
//
|
||||
// Redistribution and use in source and binary forms, with or without
|
||||
// modification, are permitted provided that the following conditions are
|
||||
// met:
|
||||
//
|
||||
// 1. Redistributions of source code must retain the above copyright
|
||||
// notice, this list of conditions and the following disclaimer.
|
||||
//
|
||||
// 2. Redistributions in binary form must reproduce the above copyright
|
||||
// notice, this list of conditions and the following disclaimer in the
|
||||
// documentation and/or other materials provided with the distribution.
|
||||
//
|
||||
// 3. Neither the name of the Corporation nor the names of the
|
||||
// contributors may be used to endorse or promote products derived from
|
||||
// this software without specific prior written permission.
|
||||
//
|
||||
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
|
||||
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
|
||||
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
|
||||
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
//
|
||||
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
|
||||
//
|
||||
// ************************************************************************
|
||||
//@HEADER
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_IMPL_REDUCER_HPP
|
||||
#define KOKKOS_IMPL_REDUCER_HPP
|
||||
|
||||
#include <impl/Kokkos_Traits.hpp>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
/* Reducer abstraction:
|
||||
* 1) Provides 'join' operation
|
||||
* 2) Provides 'init' operation
|
||||
* 3) Provides 'copy' operation
|
||||
* 4) Optionally provides result value in a memory space
|
||||
*
|
||||
* Created from:
|
||||
* 1) Functor::operator()( destination , source )
|
||||
* 2) Functor::{ join , init )
|
||||
*/
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
template< typename value_type >
|
||||
struct ReduceSum
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void copy( value_type & dest
|
||||
, value_type const & src ) noexcept
|
||||
{ dest = src ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( value_type & dest ) noexcept
|
||||
{ new( &dest ) value_type(); }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( value_type volatile & dest
|
||||
, value_type const volatile & src ) noexcept
|
||||
{ dest += src ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( value_type & dest
|
||||
, value_type const & src ) noexcept
|
||||
{ dest += src ; }
|
||||
};
|
||||
|
||||
template< typename T
|
||||
, class ReduceOp = ReduceSum< T >
|
||||
, typename MemorySpace = void >
|
||||
struct Reducer
|
||||
: private ReduceOp
|
||||
, private integral_nonzero_constant
|
||||
< int , ( std::rank<T>::value == 1 ? std::extent<T>::value : 1 )>
|
||||
{
|
||||
private:
|
||||
|
||||
// Determine if T is simple array
|
||||
|
||||
enum : int { rank = std::rank<T>::value };
|
||||
|
||||
static_assert( rank <= 1 , "Kokkos::Impl::Reducer type is at most rank-one" );
|
||||
|
||||
using length_t =
|
||||
integral_nonzero_constant<int,( rank == 1 ? std::extent<T>::value : 1 )> ;
|
||||
|
||||
public:
|
||||
|
||||
using reducer = Reducer ;
|
||||
using memory_space = MemorySpace ;
|
||||
using value_type = typename std::remove_extent<T>::type ;
|
||||
using reference_type =
|
||||
typename std::conditional< ( rank != 0 )
|
||||
, value_type *
|
||||
, value_type &
|
||||
>::type ;
|
||||
private:
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
// Determine what functions 'ReduceOp' provides:
|
||||
// copy( destination , source )
|
||||
// init( destination )
|
||||
//
|
||||
// operator()( destination , source )
|
||||
// join( destination , source )
|
||||
//
|
||||
// Provide defaults for missing optional operations
|
||||
|
||||
template< class R , typename = void>
|
||||
struct COPY {
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void copy( R const &
|
||||
, value_type * dst
|
||||
, value_type const * src ) { *dst = *src ; }
|
||||
};
|
||||
|
||||
template< class R >
|
||||
struct COPY< R , decltype( ((R*)0)->copy( *((value_type*)0)
|
||||
, *((value_type const *)0) ) ) >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void copy( R const & r
|
||||
, value_type * dst
|
||||
, value_type const * src ) { r.copy( *dst , *src ); }
|
||||
};
|
||||
|
||||
template< class R , typename = void >
|
||||
struct INIT {
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( R const & , value_type * dst ) { new(dst) value_type(); }
|
||||
};
|
||||
|
||||
template< class R >
|
||||
struct INIT< R , decltype( ((R*)0)->init( *((value_type*)0 ) ) ) >
|
||||
{
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void init( R const & r , value_type * dst ) { r.init( *dst ); }
|
||||
};
|
||||
|
||||
template< class R , typename V , typename = void > struct JOIN
|
||||
{
|
||||
// If no join function then try operator()
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( R const & r , V * dst , V const * src )
|
||||
{ r.operator()(*dst,*src); }
|
||||
};
|
||||
|
||||
template< class R , typename V >
|
||||
struct JOIN< R , V , decltype( ((R*)0)->join ( *((V *)0) , *((V const *)0) ) ) >
|
||||
{
|
||||
// If has join function use it
|
||||
KOKKOS_INLINE_FUNCTION static
|
||||
void join( R const & r , V * dst , V const * src )
|
||||
{ r.join(*dst,*src); }
|
||||
};
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
value_type * const m_result ;
|
||||
|
||||
template< int Rank >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static constexpr
|
||||
typename std::enable_if< ( 0 != Rank ) , reference_type >::type
|
||||
ref( value_type * p ) noexcept { return p ; }
|
||||
|
||||
template< int Rank >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
static constexpr
|
||||
typename std::enable_if< ( 0 == Rank ) , reference_type >::type
|
||||
ref( value_type * p ) noexcept { return *p ; }
|
||||
|
||||
public:
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr int length() const noexcept
|
||||
{ return length_t::value ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
value_type * data() const noexcept
|
||||
{ return m_result ; }
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
reference_type reference() const noexcept
|
||||
{ return Reducer::template ref< rank >( m_result ); }
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void copy( value_type * const dest
|
||||
, value_type const * const src ) const noexcept
|
||||
{
|
||||
for ( int i = 0 ; i < length() ; ++i ) {
|
||||
Reducer::template COPY<ReduceOp>::copy( (ReduceOp &) *this , dest + i , src + i );
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void init( value_type * dest ) const noexcept
|
||||
{
|
||||
for ( int i = 0 ; i < length() ; ++i ) {
|
||||
Reducer::template INIT<ReduceOp>::init( (ReduceOp &) *this , dest + i );
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join( value_type * const dest
|
||||
, value_type const * const src ) const noexcept
|
||||
{
|
||||
for ( int i = 0 ; i < length() ; ++i ) {
|
||||
Reducer::template JOIN<ReduceOp,value_type>::join( (ReduceOp &) *this , dest + i , src + i );
|
||||
}
|
||||
}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void join( value_type volatile * const dest
|
||||
, value_type volatile const * const src ) const noexcept
|
||||
{
|
||||
for ( int i = 0 ; i < length() ; ++i ) {
|
||||
Reducer::template JOIN<ReduceOp,value_type volatile>::join( (ReduceOp &) *this , dest + i , src + i );
|
||||
}
|
||||
}
|
||||
|
||||
//--------------------------------------------------------------------------
|
||||
|
||||
template< typename ArgT >
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
constexpr Reducer
|
||||
( ArgT * arg_value
|
||||
, typename std::enable_if
|
||||
< std::is_same<ArgT,value_type>::value &&
|
||||
std::is_default_constructible< ReduceOp >::value
|
||||
, int >::type arg_length = 1
|
||||
) noexcept
|
||||
: ReduceOp(), length_t( arg_length ), m_result( arg_value ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
constexpr Reducer( ReduceOp const & arg_op
|
||||
, value_type * arg_value = 0
|
||||
, int arg_length = 1 ) noexcept
|
||||
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION explicit
|
||||
constexpr Reducer( ReduceOp && arg_op
|
||||
, value_type * arg_value = 0
|
||||
, int arg_length = 1 ) noexcept
|
||||
: ReduceOp( arg_op ), length_t( arg_length ), m_result( arg_value ) {}
|
||||
|
||||
Reducer( Reducer const & ) = default ;
|
||||
Reducer( Reducer && ) = default ;
|
||||
Reducer & operator = ( Reducer const & ) = default ;
|
||||
Reducer & operator = ( Reducer && ) = default ;
|
||||
};
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
template< typename ValueType >
|
||||
constexpr
|
||||
Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >
|
||||
Sum( ValueType & arg_value )
|
||||
{
|
||||
static_assert( std::is_trivial<ValueType>::value
|
||||
, "Kokkos reducer requires trivial value type" );
|
||||
return Impl::Reducer< ValueType , Impl::ReduceSum< ValueType > >( & arg_value );
|
||||
}
|
||||
|
||||
template< typename ValueType >
|
||||
constexpr
|
||||
Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >
|
||||
Sum( ValueType * arg_value , int arg_length )
|
||||
{
|
||||
static_assert( std::is_trivial<ValueType>::value
|
||||
, "Kokkos reducer requires trivial value type" );
|
||||
return Impl::Reducer< ValueType[] , Impl::ReduceSum< ValueType > >( arg_value , arg_length );
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ValueType , class JoinType >
|
||||
Impl::Reducer< ValueType , JoinType >
|
||||
reducer( ValueType & value , JoinType const & lambda )
|
||||
{
|
||||
return Impl::Reducer< ValueType , JoinType >( lambda , & value );
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
#endif /* #ifndef KOKKOS_IMPL_REDUCER_HPP */
|
||||
|
||||
@ -53,63 +53,126 @@
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
namespace SerialImpl {
|
||||
namespace {
|
||||
|
||||
Sentinel::Sentinel() : m_scratch(0), m_reduce_end(0), m_shared_end(0) {}
|
||||
HostThreadTeamData g_serial_thread_team_data ;
|
||||
|
||||
Sentinel::~Sentinel()
|
||||
{
|
||||
if ( m_scratch ) { free( m_scratch ); }
|
||||
m_scratch = 0 ;
|
||||
m_reduce_end = 0 ;
|
||||
m_shared_end = 0 ;
|
||||
}
|
||||
|
||||
Sentinel & Sentinel::singleton()
|
||||
// Resize thread team data scratch memory
|
||||
void serial_resize_thread_team_data( size_t pool_reduce_bytes
|
||||
, size_t team_reduce_bytes
|
||||
, size_t team_shared_bytes
|
||||
, size_t thread_local_bytes )
|
||||
{
|
||||
static Sentinel s ; return s ;
|
||||
}
|
||||
if ( pool_reduce_bytes < 512 ) pool_reduce_bytes = 512 ;
|
||||
if ( team_reduce_bytes < 512 ) team_reduce_bytes = 512 ;
|
||||
|
||||
inline
|
||||
unsigned align( unsigned n )
|
||||
{
|
||||
enum { ALIGN = 0x0100 /* 256 */ , MASK = ALIGN - 1 };
|
||||
return ( n + MASK ) & ~MASK ;
|
||||
}
|
||||
const size_t old_pool_reduce = g_serial_thread_team_data.pool_reduce_bytes();
|
||||
const size_t old_team_reduce = g_serial_thread_team_data.team_reduce_bytes();
|
||||
const size_t old_team_shared = g_serial_thread_team_data.team_shared_bytes();
|
||||
const size_t old_thread_local = g_serial_thread_team_data.thread_local_bytes();
|
||||
const size_t old_alloc_bytes = g_serial_thread_team_data.scratch_bytes();
|
||||
|
||||
} // namespace
|
||||
// Allocate if any of the old allocation is tool small:
|
||||
|
||||
SerialTeamMember::SerialTeamMember( int arg_league_rank
|
||||
, int arg_league_size
|
||||
, int arg_shared_size
|
||||
)
|
||||
: m_space( ((char *) SerialImpl::Sentinel::singleton().m_scratch) + SerialImpl::Sentinel::singleton().m_reduce_end
|
||||
, arg_shared_size )
|
||||
, m_league_rank( arg_league_rank )
|
||||
, m_league_size( arg_league_size )
|
||||
{}
|
||||
const bool allocate = ( old_pool_reduce < pool_reduce_bytes ) ||
|
||||
( old_team_reduce < team_reduce_bytes ) ||
|
||||
( old_team_shared < team_shared_bytes ) ||
|
||||
( old_thread_local < thread_local_bytes );
|
||||
|
||||
} // namespace Impl
|
||||
if ( allocate ) {
|
||||
|
||||
void * Serial::scratch_memory_resize( unsigned reduce_size , unsigned shared_size )
|
||||
{
|
||||
static Impl::SerialImpl::Sentinel & s = Impl::SerialImpl::Sentinel::singleton();
|
||||
Kokkos::HostSpace space ;
|
||||
|
||||
reduce_size = Impl::SerialImpl::align( reduce_size );
|
||||
shared_size = Impl::SerialImpl::align( shared_size );
|
||||
if ( old_alloc_bytes ) {
|
||||
g_serial_thread_team_data.disband_team();
|
||||
g_serial_thread_team_data.disband_pool();
|
||||
|
||||
if ( ( s.m_reduce_end < reduce_size ) ||
|
||||
( s.m_shared_end < s.m_reduce_end + shared_size ) ) {
|
||||
|
||||
if ( s.m_scratch ) { free( s.m_scratch ); }
|
||||
|
||||
if ( s.m_reduce_end < reduce_size ) s.m_reduce_end = reduce_size ;
|
||||
if ( s.m_shared_end < s.m_reduce_end + shared_size ) s.m_shared_end = s.m_reduce_end + shared_size ;
|
||||
|
||||
s.m_scratch = malloc( s.m_shared_end );
|
||||
space.deallocate( g_serial_thread_team_data.scratch_buffer()
|
||||
, g_serial_thread_team_data.scratch_bytes() );
|
||||
}
|
||||
|
||||
return s.m_scratch ;
|
||||
if ( pool_reduce_bytes < old_pool_reduce ) { pool_reduce_bytes = old_pool_reduce ; }
|
||||
if ( team_reduce_bytes < old_team_reduce ) { team_reduce_bytes = old_team_reduce ; }
|
||||
if ( team_shared_bytes < old_team_shared ) { team_shared_bytes = old_team_shared ; }
|
||||
if ( thread_local_bytes < old_thread_local ) { thread_local_bytes = old_thread_local ; }
|
||||
|
||||
const size_t alloc_bytes =
|
||||
HostThreadTeamData::scratch_size( pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes );
|
||||
|
||||
void * const ptr = space.allocate( alloc_bytes );
|
||||
|
||||
g_serial_thread_team_data.
|
||||
scratch_assign( ((char *)ptr)
|
||||
, alloc_bytes
|
||||
, pool_reduce_bytes
|
||||
, team_reduce_bytes
|
||||
, team_shared_bytes
|
||||
, thread_local_bytes );
|
||||
|
||||
HostThreadTeamData * pool[1] = { & g_serial_thread_team_data };
|
||||
|
||||
g_serial_thread_team_data.organize_pool( pool , 1 );
|
||||
g_serial_thread_team_data.organize_team(1);
|
||||
}
|
||||
}
|
||||
|
||||
// Get thread team data structure for omp_get_thread_num()
|
||||
HostThreadTeamData * serial_get_thread_team_data()
|
||||
{
|
||||
return & g_serial_thread_team_data ;
|
||||
}
|
||||
|
||||
} // namespace Impl
|
||||
} // namespace Kokkos
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
int Serial::is_initialized()
|
||||
{
|
||||
return 1 ;
|
||||
}
|
||||
|
||||
void Serial::initialize( unsigned threads_count
|
||||
, unsigned use_numa_count
|
||||
, unsigned use_cores_per_numa
|
||||
, bool allow_asynchronous_threadpool )
|
||||
{
|
||||
(void) threads_count;
|
||||
(void) use_numa_count;
|
||||
(void) use_cores_per_numa;
|
||||
(void) allow_asynchronous_threadpool;
|
||||
|
||||
// Init the array of locks used for arbitrarily sized atomics
|
||||
Impl::init_lock_array_host_space();
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::initialize();
|
||||
#endif
|
||||
}
|
||||
|
||||
void Serial::finalize()
|
||||
{
|
||||
if ( Impl::g_serial_thread_team_data.scratch_buffer() ) {
|
||||
Impl::g_serial_thread_team_data.disband_team();
|
||||
Impl::g_serial_thread_team_data.disband_pool();
|
||||
|
||||
Kokkos::HostSpace space ;
|
||||
|
||||
space.deallocate( Impl::g_serial_thread_team_data.scratch_buffer()
|
||||
, Impl::g_serial_thread_team_data.scratch_bytes() );
|
||||
|
||||
Impl::g_serial_thread_team_data.scratch_assign( (void*) 0, 0, 0, 0, 0, 0 );
|
||||
}
|
||||
|
||||
#if defined(KOKKOS_ENABLE_PROFILING)
|
||||
Kokkos::Profiling::finalize();
|
||||
#endif
|
||||
}
|
||||
|
||||
} // namespace Kokkos
|
||||
|
||||
@ -62,11 +62,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
|
||||
using execution_space = Kokkos::Serial ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
using Member = Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member exec ;
|
||||
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
|
||||
|
||||
Member exec( *data );
|
||||
|
||||
// Loop until all queues are empty
|
||||
while ( 0 < queue->m_ready_count ) {
|
||||
@ -75,13 +77,13 @@ void TaskQueueSpecialization< Kokkos::Serial >::execute
|
||||
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
|
||||
task = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
if ( end != task ) {
|
||||
|
||||
// pop_task resulted in lock == task->m_next
|
||||
// pop_ready_task resulted in lock == task->m_next
|
||||
// In the executing state
|
||||
|
||||
(*task->m_apply)( task , & exec );
|
||||
@ -113,11 +115,13 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
|
||||
using execution_space = Kokkos::Serial ;
|
||||
using queue_type = TaskQueue< execution_space > ;
|
||||
using task_root_type = TaskBase< execution_space , void , void > ;
|
||||
using Member = TaskExec< execution_space > ;
|
||||
using Member = Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
Member exec ;
|
||||
Impl::HostThreadTeamData * const data = Impl::serial_get_thread_team_data();
|
||||
|
||||
Member exec( *data );
|
||||
|
||||
// Loop until no runnable task
|
||||
|
||||
@ -129,7 +133,7 @@ void TaskQueueSpecialization< Kokkos::Serial > ::
|
||||
|
||||
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
|
||||
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
|
||||
task = queue_type::pop_task( & queue->m_ready[i][j] );
|
||||
task = queue_type::pop_ready_task( & queue->m_ready[i][j] );
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -65,6 +65,7 @@ public:
|
||||
using memory_space = Kokkos::HostSpace ;
|
||||
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
|
||||
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
|
||||
using member_type = Kokkos::Impl::HostThreadTeamMember< execution_space > ;
|
||||
|
||||
static
|
||||
void iff_single_thread_recursive_execute( queue_type * const );
|
||||
@ -72,237 +73,19 @@ public:
|
||||
static
|
||||
void execute( queue_type * const );
|
||||
|
||||
template< typename FunctorType >
|
||||
template< typename TaskType >
|
||||
static
|
||||
void proc_set_apply( task_base_type::function_type * ptr )
|
||||
{
|
||||
using TaskType = TaskBase< Kokkos::Serial
|
||||
, typename FunctorType::value_type
|
||||
, FunctorType
|
||||
> ;
|
||||
*ptr = TaskType::apply ;
|
||||
}
|
||||
typename TaskType::function_type
|
||||
get_function_pointer() { return TaskType::apply ; }
|
||||
};
|
||||
|
||||
extern template class TaskQueue< Kokkos::Serial > ;
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<>
|
||||
class TaskExec< Kokkos::Serial >
|
||||
{
|
||||
public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
|
||||
KOKKOS_INLINE_FUNCTION int team_rank() const { return 0 ; }
|
||||
KOKKOS_INLINE_FUNCTION int team_size() const { return 1 ; }
|
||||
};
|
||||
|
||||
template<typename iType>
|
||||
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
|
||||
{
|
||||
typedef iType index_type;
|
||||
const iType start ;
|
||||
const iType end ;
|
||||
enum {increment = 1};
|
||||
//const TaskExec< Kokkos::Serial > & thread;
|
||||
TaskExec< Kokkos::Serial > & thread;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct
|
||||
//( const TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
|
||||
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
|
||||
: start(0)
|
||||
, end(arg_count)
|
||||
, thread(arg_thread)
|
||||
{}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TeamThreadRangeBoundariesStruct
|
||||
//( const TaskExec< Kokkos::Serial > & arg_thread
|
||||
( TaskExec< Kokkos::Serial > & arg_thread
|
||||
, const iType& arg_start
|
||||
, const iType & arg_end
|
||||
)
|
||||
: start( arg_start )
|
||||
, end( arg_end)
|
||||
, thread( arg_thread )
|
||||
{}
|
||||
};
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template<typename iType>
|
||||
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Serial > >
|
||||
{
|
||||
typedef iType index_type;
|
||||
const iType start ;
|
||||
const iType end ;
|
||||
enum {increment = 1};
|
||||
TaskExec< Kokkos::Serial > & thread;
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
ThreadVectorRangeBoundariesStruct
|
||||
( TaskExec< Kokkos::Serial > & arg_thread, const iType& arg_count)
|
||||
: start( 0 )
|
||||
, end(arg_count)
|
||||
, thread(arg_thread)
|
||||
{}
|
||||
};
|
||||
|
||||
}} /* namespace Kokkos::Impl */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
namespace Kokkos {
|
||||
|
||||
// OMP version needs non-const TaskExec
|
||||
template< typename iType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >
|
||||
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType & count )
|
||||
{
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >( thread, count );
|
||||
}
|
||||
|
||||
// OMP version needs non-const TaskExec
|
||||
template< typename iType1, typename iType2 >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::TeamThreadRangeBoundariesStruct< typename std::common_type< iType1, iType2 >::type,
|
||||
Impl::TaskExec< Kokkos::Serial > >
|
||||
TeamThreadRange( Impl::TaskExec< Kokkos::Serial > & thread, const iType1 & start, const iType2 & end )
|
||||
{
|
||||
typedef typename std::common_type< iType1, iType2 >::type iType;
|
||||
return Impl::TeamThreadRangeBoundariesStruct< iType, Impl::TaskExec< Kokkos::Serial > >(
|
||||
thread, iType(start), iType(end) );
|
||||
}
|
||||
|
||||
// OMP version needs non-const TaskExec
|
||||
template<typename iType>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >
|
||||
ThreadVectorRange
|
||||
( Impl::TaskExec< Kokkos::Serial > & thread
|
||||
, const iType & count )
|
||||
{
|
||||
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >(thread,count);
|
||||
}
|
||||
|
||||
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
|
||||
*
|
||||
* The range i=0..N-1 is mapped to all threads of the the calling thread team.
|
||||
* This functionality requires C++11 support.*/
|
||||
template<typename iType, class Lambda>
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_for(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries, const Lambda& lambda) {
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i);
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i, result);
|
||||
|
||||
initialized_result = result;
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
ValueType result = initialized_result;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment)
|
||||
lambda(i, result);
|
||||
|
||||
initialized_result = result;
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
initialized_result = ValueType();
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
initialized_result+=tmp;
|
||||
}
|
||||
}
|
||||
|
||||
template< typename iType, class Lambda, typename ValueType, class JoinType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_reduce
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
|
||||
const Lambda & lambda,
|
||||
const JoinType & join,
|
||||
ValueType& initialized_result)
|
||||
{
|
||||
ValueType result = initialized_result;
|
||||
#ifdef KOKKOS_ENABLE_PRAGMA_IVDEP
|
||||
#pragma ivdep
|
||||
#endif
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
ValueType tmp = ValueType();
|
||||
lambda(i,tmp);
|
||||
join(result,tmp);
|
||||
}
|
||||
initialized_result = result;
|
||||
}
|
||||
|
||||
template< typename ValueType, typename iType, class Lambda >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
ValueType accum = 0 ;
|
||||
ValueType val, local_total;
|
||||
|
||||
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
|
||||
local_total = 0;
|
||||
lambda(i,local_total,false);
|
||||
val = accum;
|
||||
lambda(i,val,true);
|
||||
accum += local_total;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// placeholder for future function
|
||||
template< typename iType, class Lambda, typename ValueType >
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void parallel_scan
|
||||
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Serial > >& loop_boundaries,
|
||||
const Lambda & lambda)
|
||||
{
|
||||
}
|
||||
|
||||
} /* namespace Kokkos */
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
#endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */
|
||||
#endif /* #ifndef KOKKOS_IMPL_SERIAL_TASK_HPP */
|
||||
|
||||
|
||||
@ -1,693 +0,0 @@
|
||||
/*
|
||||
|
||||
Copyright (c) 2014, NVIDIA Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_SYNCHRONIC_HPP
|
||||
#define KOKKOS_SYNCHRONIC_HPP
|
||||
|
||||
#include <impl/Kokkos_Synchronic_Config.hpp>
|
||||
|
||||
#include <atomic>
|
||||
#include <chrono>
|
||||
#include <thread>
|
||||
#include <functional>
|
||||
#include <algorithm>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
enum notify_hint {
|
||||
notify_all,
|
||||
notify_one,
|
||||
notify_none
|
||||
};
|
||||
enum expect_hint {
|
||||
expect_urgent,
|
||||
expect_delay
|
||||
};
|
||||
|
||||
namespace Details {
|
||||
|
||||
template <class S, class T>
|
||||
bool __synchronic_spin_wait_for_update(S const& arg, T const& nval, int attempts) noexcept {
|
||||
int i = 0;
|
||||
for(;i < __SYNCHRONIC_SPIN_RELAX(attempts); ++i)
|
||||
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
|
||||
return true;
|
||||
else
|
||||
__synchronic_relax();
|
||||
for(;i < attempts; ++i)
|
||||
if(__builtin_expect(arg.load(std::memory_order_relaxed) != nval,1))
|
||||
return true;
|
||||
else
|
||||
__synchronic_yield();
|
||||
return false;
|
||||
}
|
||||
|
||||
struct __exponential_backoff {
|
||||
__exponential_backoff(int arg_maximum=512) : maximum(arg_maximum), microseconds(8), x(123456789), y(362436069), z(521288629) {
|
||||
}
|
||||
static inline void sleep_for(std::chrono::microseconds const& time) {
|
||||
auto t = time.count();
|
||||
if(__builtin_expect(t > 75,0)) {
|
||||
portable_sleep(time);
|
||||
}
|
||||
else if(__builtin_expect(t > 25,0))
|
||||
__synchronic_yield();
|
||||
else
|
||||
__synchronic_relax();
|
||||
}
|
||||
void sleep_for_step() {
|
||||
sleep_for(step());
|
||||
}
|
||||
std::chrono::microseconds step() {
|
||||
float const f = ranfu();
|
||||
int const t = int(microseconds * f);
|
||||
if(__builtin_expect(f >= 0.95f,0))
|
||||
microseconds = 8;
|
||||
else
|
||||
microseconds = (std::min)(microseconds>>1,maximum);
|
||||
return std::chrono::microseconds(t);
|
||||
}
|
||||
private :
|
||||
int maximum, microseconds, x, y, z;
|
||||
int xorshf96() {
|
||||
int t;
|
||||
x ^= x << 16; x ^= x >> 5; x ^= x << 1;
|
||||
t = x; x = y; y = z; z = t ^ x ^ y;
|
||||
return z;
|
||||
}
|
||||
float ranfu() {
|
||||
return (float)(xorshf96()&(~0UL>>1)) / (float)(~0UL>>1);
|
||||
}
|
||||
};
|
||||
|
||||
template <class T, class Enable = void>
|
||||
struct __synchronic_base {
|
||||
|
||||
protected:
|
||||
std::atomic<T> atom;
|
||||
|
||||
void notify(notify_hint = notify_all) noexcept {
|
||||
}
|
||||
void notify(notify_hint = notify_all) volatile noexcept {
|
||||
}
|
||||
|
||||
public :
|
||||
__synchronic_base() noexcept = default;
|
||||
constexpr __synchronic_base(T v) noexcept : atom(v) { }
|
||||
__synchronic_base(const __synchronic_base&) = delete;
|
||||
~__synchronic_base() { }
|
||||
__synchronic_base& operator=(const __synchronic_base&) = delete;
|
||||
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
|
||||
|
||||
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
|
||||
return;
|
||||
__exponential_backoff b;
|
||||
while(atom.load(std::memory_order_relaxed) == val) {
|
||||
__do_backoff(b);
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
|
||||
return;
|
||||
}
|
||||
}
|
||||
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
|
||||
return;
|
||||
__exponential_backoff b;
|
||||
while(atom.load(std::memory_order_relaxed) == val) {
|
||||
__do_backoff(b);
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
template <class Clock, class Duration>
|
||||
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
|
||||
return;
|
||||
__exponential_backoff b;
|
||||
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
|
||||
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
|
||||
__do_backoff(b);
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
|
||||
return;
|
||||
remains = then - std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
}
|
||||
template <class Clock, class Duration>
|
||||
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_A))
|
||||
return;
|
||||
__exponential_backoff b;
|
||||
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
|
||||
while(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val) {
|
||||
__do_backoff(b);
|
||||
if(__synchronic_spin_wait_for_update(atom, val, __SYNCHRONIC_SPIN_COUNT_B))
|
||||
return;
|
||||
remains = then - std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
#ifdef __SYNCHRONIC_COMPATIBLE
|
||||
template <class T>
|
||||
struct __synchronic_base<T, typename std::enable_if<__SYNCHRONIC_COMPATIBLE(T)>::type> {
|
||||
|
||||
public:
|
||||
std::atomic<T> atom;
|
||||
|
||||
void notify(notify_hint hint = notify_all) noexcept {
|
||||
if(__builtin_expect(hint == notify_none,1))
|
||||
return;
|
||||
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
|
||||
if(__builtin_expect(x,0)) {
|
||||
if(__builtin_expect(hint == notify_all,1))
|
||||
__synchronic_wake_all(&atom);
|
||||
else
|
||||
__synchronic_wake_one(&atom);
|
||||
}
|
||||
}
|
||||
void notify(notify_hint hint = notify_all) volatile noexcept {
|
||||
if(__builtin_expect(hint == notify_none,1))
|
||||
return;
|
||||
auto const x = count.fetch_add(0,std::memory_order_acq_rel);
|
||||
if(__builtin_expect(x,0)) {
|
||||
if(__builtin_expect(hint == notify_all,1))
|
||||
__synchronic_wake_all_volatile(&atom);
|
||||
else
|
||||
__synchronic_wake_one_volatile(&atom);
|
||||
}
|
||||
}
|
||||
|
||||
public :
|
||||
__synchronic_base() noexcept : count(0) { }
|
||||
constexpr __synchronic_base(T v) noexcept : atom(v), count(0) { }
|
||||
__synchronic_base(const __synchronic_base&) = delete;
|
||||
~__synchronic_base() { }
|
||||
__synchronic_base& operator=(const __synchronic_base&) = delete;
|
||||
__synchronic_base& operator=(const __synchronic_base&) volatile = delete;
|
||||
|
||||
void expect_update(T val, expect_hint = expect_urgent) const noexcept {
|
||||
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
|
||||
return;
|
||||
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
|
||||
count.fetch_add(1,std::memory_order_release);
|
||||
__synchronic_wait(&atom,val);
|
||||
count.fetch_add(-1,std::memory_order_acquire);
|
||||
}
|
||||
}
|
||||
void expect_update(T val, expect_hint = expect_urgent) const volatile noexcept {
|
||||
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
|
||||
return;
|
||||
while(__builtin_expect(atom.load(std::memory_order_relaxed) == val,1)) {
|
||||
count.fetch_add(1,std::memory_order_release);
|
||||
__synchronic_wait_volatile(&atom,val);
|
||||
count.fetch_add(-1,std::memory_order_acquire);
|
||||
}
|
||||
}
|
||||
|
||||
template <class Clock, class Duration>
|
||||
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const {
|
||||
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
|
||||
return;
|
||||
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
|
||||
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
|
||||
count.fetch_add(1,std::memory_order_release);
|
||||
__synchronic_wait_timed(&atom,val,remains);
|
||||
count.fetch_add(-1,std::memory_order_acquire);
|
||||
remains = then - std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
}
|
||||
template <class Clock, class Duration>
|
||||
void expect_update_until(T val, std::chrono::time_point<Clock,Duration> const& then, expect_hint = expect_urgent) const volatile {
|
||||
if(__builtin_expect(__synchronic_spin_wait_for_update(atom, val,__SYNCHRONIC_SPIN_COUNT_A),1))
|
||||
return;
|
||||
std::chrono::milliseconds remains = then - std::chrono::high_resolution_clock::now();
|
||||
while(__builtin_expect(remains > std::chrono::milliseconds::zero() && atom.load(std::memory_order_relaxed) == val,1)) {
|
||||
count.fetch_add(1,std::memory_order_release);
|
||||
__synchronic_wait_timed_volatile(&atom,val,remains);
|
||||
count.fetch_add(-1,std::memory_order_acquire);
|
||||
remains = then - std::chrono::high_resolution_clock::now();
|
||||
}
|
||||
}
|
||||
private:
|
||||
mutable std::atomic<int> count;
|
||||
};
|
||||
#endif
|
||||
|
||||
template <class T, class Enable = void>
|
||||
struct __synchronic : public __synchronic_base<T> {
|
||||
|
||||
__synchronic() noexcept = default;
|
||||
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
|
||||
__synchronic(const __synchronic&) = delete;
|
||||
__synchronic& operator=(const __synchronic&) = delete;
|
||||
__synchronic& operator=(const __synchronic&) volatile = delete;
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct __synchronic<T,typename std::enable_if<std::is_integral<T>::value>::type> : public __synchronic_base<T> {
|
||||
|
||||
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.fetch_add(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_add(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.fetch_add(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.fetch_sub(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_sub(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.fetch_sub(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.fetch_and(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_and(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.fetch_and(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.fetch_or(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_or(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.fetch_or(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.fetch_xor(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T fetch_xor(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.fetch_xor(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
|
||||
__synchronic() noexcept = default;
|
||||
constexpr __synchronic(T v) noexcept : __synchronic_base<T>(v) { }
|
||||
__synchronic(const __synchronic&) = delete;
|
||||
__synchronic& operator=(const __synchronic&) = delete;
|
||||
__synchronic& operator=(const __synchronic&) volatile = delete;
|
||||
|
||||
T operator=(T v) volatile noexcept {
|
||||
auto const t = this->atom = v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator=(T v) noexcept {
|
||||
auto const t = this->atom = v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator++(int) volatile noexcept {
|
||||
auto const t = ++this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator++(int) noexcept {
|
||||
auto const t = ++this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator--(int) volatile noexcept {
|
||||
auto const t = --this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator--(int) noexcept {
|
||||
auto const t = --this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator++() volatile noexcept {
|
||||
auto const t = this->atom++;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator++() noexcept {
|
||||
auto const t = this->atom++;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator--() volatile noexcept {
|
||||
auto const t = this->atom--;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator--() noexcept {
|
||||
auto const t = this->atom--;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator+=(T v) volatile noexcept {
|
||||
auto const t = this->atom += v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator+=(T v) noexcept {
|
||||
auto const t = this->atom += v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator-=(T v) volatile noexcept {
|
||||
auto const t = this->atom -= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator-=(T v) noexcept {
|
||||
auto const t = this->atom -= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator&=(T v) volatile noexcept {
|
||||
auto const t = this->atom &= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator&=(T v) noexcept {
|
||||
auto const t = this->atom &= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator|=(T v) volatile noexcept {
|
||||
auto const t = this->atom |= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator|=(T v) noexcept {
|
||||
auto const t = this->atom |= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator^=(T v) volatile noexcept {
|
||||
auto const t = this->atom ^= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T operator^=(T v) noexcept {
|
||||
auto const t = this->atom ^= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
};
|
||||
|
||||
template <class T>
|
||||
struct __synchronic<T*> : public __synchronic_base<T*> {
|
||||
|
||||
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.fetch_add(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T* fetch_add(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.fetch_add(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.fetch_sub(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T* fetch_sub(ptrdiff_t v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.fetch_sub(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
|
||||
__synchronic() noexcept = default;
|
||||
constexpr __synchronic(T* v) noexcept : __synchronic_base<T*>(v) { }
|
||||
__synchronic(const __synchronic&) = delete;
|
||||
__synchronic& operator=(const __synchronic&) = delete;
|
||||
__synchronic& operator=(const __synchronic&) volatile = delete;
|
||||
|
||||
T* operator=(T* v) volatile noexcept {
|
||||
auto const t = this->atom = v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator=(T* v) noexcept {
|
||||
auto const t = this->atom = v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator++(int) volatile noexcept {
|
||||
auto const t = ++this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator++(int) noexcept {
|
||||
auto const t = ++this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator--(int) volatile noexcept {
|
||||
auto const t = --this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator--(int) noexcept {
|
||||
auto const t = --this->atom;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator++() volatile noexcept {
|
||||
auto const t = this->atom++;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator++() noexcept {
|
||||
auto const t = this->atom++;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator--() volatile noexcept {
|
||||
auto const t = this->atom--;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator--() noexcept {
|
||||
auto const t = this->atom--;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator+=(ptrdiff_t v) volatile noexcept {
|
||||
auto const t = this->atom += v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator+=(ptrdiff_t v) noexcept {
|
||||
auto const t = this->atom += v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator-=(ptrdiff_t v) volatile noexcept {
|
||||
auto const t = this->atom -= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
T* operator-=(ptrdiff_t v) noexcept {
|
||||
auto const t = this->atom -= v;
|
||||
this->notify();
|
||||
return t;
|
||||
}
|
||||
};
|
||||
|
||||
} //namespace Details
|
||||
|
||||
template <class T>
|
||||
struct synchronic : public Details::__synchronic<T> {
|
||||
|
||||
bool is_lock_free() const volatile noexcept { return this->atom.is_lock_free(); }
|
||||
bool is_lock_free() const noexcept { return this->atom.is_lock_free(); }
|
||||
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
this->atom.store(v,m);
|
||||
this->notify(n);
|
||||
}
|
||||
void store(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
this->atom.store(v,m);
|
||||
this->notify(n);
|
||||
}
|
||||
T load(std::memory_order m = std::memory_order_seq_cst) const volatile noexcept { return this->atom.load(m); }
|
||||
T load(std::memory_order m = std::memory_order_seq_cst) const noexcept { return this->atom.load(m); }
|
||||
|
||||
operator T() const volatile noexcept { return (T)this->atom; }
|
||||
operator T() const noexcept { return (T)this->atom; }
|
||||
|
||||
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.exchange(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
T exchange(T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.exchange(v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.compare_exchange_weak(r,v,m1,m2);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_weak(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.compare_exchange_weak(r,v,m1, m2);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_strong(T& r, T v, std::memory_order m1, std::memory_order m2, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.compare_exchange_strong(r,v,m1,m2);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.compare_exchange_weak(r,v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_weak(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.compare_exchange_weak(r,v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) volatile noexcept {
|
||||
auto const t = this->atom.compare_exchange_strong(r,v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
bool compare_exchange_strong(T& r, T v, std::memory_order m = std::memory_order_seq_cst, notify_hint n = notify_all) noexcept {
|
||||
auto const t = this->atom.compare_exchange_strong(r,v,m);
|
||||
this->notify(n);
|
||||
return t;
|
||||
}
|
||||
|
||||
synchronic() noexcept = default;
|
||||
constexpr synchronic(T val) noexcept : Details::__synchronic<T>(val) { }
|
||||
synchronic(const synchronic&) = delete;
|
||||
~synchronic() { }
|
||||
synchronic& operator=(const synchronic&) = delete;
|
||||
synchronic& operator=(const synchronic&) volatile = delete;
|
||||
T operator=(T val) noexcept {
|
||||
return Details::__synchronic<T>::operator=(val);
|
||||
}
|
||||
T operator=(T val) volatile noexcept {
|
||||
return Details::__synchronic<T>::operator=(val);
|
||||
}
|
||||
|
||||
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
|
||||
Details::__synchronic<T>::expect_update(val,h);
|
||||
return load(order);
|
||||
}
|
||||
T load_when_not_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
|
||||
Details::__synchronic<T>::expect_update(val,h);
|
||||
return load(order);
|
||||
}
|
||||
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const noexcept {
|
||||
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
|
||||
Details::__synchronic<T>::expect_update(nval,h);
|
||||
return load(order);
|
||||
}
|
||||
T load_when_equal(T val, std::memory_order order = std::memory_order_seq_cst, expect_hint h = expect_urgent) const volatile noexcept {
|
||||
for(T nval = load(std::memory_order_relaxed); nval != val; nval = load(std::memory_order_relaxed))
|
||||
expect_update(nval,h);
|
||||
return load(order);
|
||||
}
|
||||
template <class Rep, class Period>
|
||||
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const {
|
||||
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
|
||||
}
|
||||
template < class Rep, class Period>
|
||||
void expect_update_for(T val, std::chrono::duration<Rep,Period> const& delta, expect_hint h = expect_urgent) const volatile {
|
||||
Details::__synchronic<T>::expect_update_until(val, std::chrono::high_resolution_clock::now() + delta,h);
|
||||
}
|
||||
};
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
typedef synchronic<char> synchronic_char;
|
||||
typedef synchronic<char> synchronic_schar;
|
||||
typedef synchronic<unsigned char> synchronic_uchar;
|
||||
typedef synchronic<short> synchronic_short;
|
||||
typedef synchronic<unsigned short> synchronic_ushort;
|
||||
typedef synchronic<int> synchronic_int;
|
||||
typedef synchronic<unsigned int> synchronic_uint;
|
||||
typedef synchronic<long> synchronic_long;
|
||||
typedef synchronic<unsigned long> synchronic_ulong;
|
||||
typedef synchronic<long long> synchronic_llong;
|
||||
typedef synchronic<unsigned long long> synchronic_ullong;
|
||||
//typedef synchronic<char16_t> synchronic_char16_t;
|
||||
//typedef synchronic<char32_t> synchronic_char32_t;
|
||||
typedef synchronic<wchar_t> synchronic_wchar_t;
|
||||
|
||||
typedef synchronic<int_least8_t> synchronic_int_least8_t;
|
||||
typedef synchronic<uint_least8_t> synchronic_uint_least8_t;
|
||||
typedef synchronic<int_least16_t> synchronic_int_least16_t;
|
||||
typedef synchronic<uint_least16_t> synchronic_uint_least16_t;
|
||||
typedef synchronic<int_least32_t> synchronic_int_least32_t;
|
||||
typedef synchronic<uint_least32_t> synchronic_uint_least32_t;
|
||||
//typedef synchronic<int_least_64_t> synchronic_int_least_64_t;
|
||||
typedef synchronic<uint_least64_t> synchronic_uint_least64_t;
|
||||
typedef synchronic<int_fast8_t> synchronic_int_fast8_t;
|
||||
typedef synchronic<uint_fast8_t> synchronic_uint_fast8_t;
|
||||
typedef synchronic<int_fast16_t> synchronic_int_fast16_t;
|
||||
typedef synchronic<uint_fast16_t> synchronic_uint_fast16_t;
|
||||
typedef synchronic<int_fast32_t> synchronic_int_fast32_t;
|
||||
typedef synchronic<uint_fast32_t> synchronic_uint_fast32_t;
|
||||
typedef synchronic<int_fast64_t> synchronic_int_fast64_t;
|
||||
typedef synchronic<uint_fast64_t> synchronic_uint_fast64_t;
|
||||
typedef synchronic<intptr_t> synchronic_intptr_t;
|
||||
typedef synchronic<uintptr_t> synchronic_uintptr_t;
|
||||
typedef synchronic<size_t> synchronic_size_t;
|
||||
typedef synchronic<ptrdiff_t> synchronic_ptrdiff_t;
|
||||
typedef synchronic<intmax_t> synchronic_intmax_t;
|
||||
typedef synchronic<uintmax_t> synchronic_uintmax_t;
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif //__SYNCHRONIC_H
|
||||
@ -1,169 +0,0 @@
|
||||
/*
|
||||
|
||||
Copyright (c) 2014, NVIDIA Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_SYNCHRONIC_CONFIG_H
|
||||
#define KOKKOS_SYNCHRONIC_CONFIG_H
|
||||
|
||||
#include <thread>
|
||||
#include <chrono>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
//the default yield function used inside the implementation is the Standard one
|
||||
#define __synchronic_yield std::this_thread::yield
|
||||
#define __synchronic_relax __synchronic_yield
|
||||
|
||||
#if defined(_MSC_VER)
|
||||
//this is a handy GCC optimization that I use inside the implementation
|
||||
#define __builtin_expect(condition,common) condition
|
||||
#if _MSC_VER <= 1800
|
||||
//using certain keywords that VC++ temporarily doesn't support
|
||||
#define _ALLOW_KEYWORD_MACROS
|
||||
#define noexcept
|
||||
#define constexpr
|
||||
#endif
|
||||
//yes, I define multiple assignment operators
|
||||
#pragma warning(disable:4522)
|
||||
//I don't understand how Windows is so bad at timing functions, but is OK
|
||||
//with straight-up yield loops
|
||||
#define __do_backoff(b) __synchronic_yield()
|
||||
#else
|
||||
#define __do_backoff(b) b.sleep_for_step()
|
||||
#endif
|
||||
|
||||
//certain platforms have efficient support for spin-waiting built into the operating system
|
||||
#if defined(__linux__) || (defined(_WIN32_WINNT) && _WIN32_WINNT >= 0x0602)
|
||||
#if defined(_WIN32_WINNT)
|
||||
#include <winsock2.h>
|
||||
#include <Windows.h>
|
||||
//the combination of WaitOnAddress and WakeByAddressAll is supported on Windows 8.1+
|
||||
#define __synchronic_wait(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
|
||||
#define __synchronic_wait_timed(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
|
||||
#define __synchronic_wake_one(x) WakeByAddressSingle((PVOID)x)
|
||||
#define __synchronic_wake_all(x) WakeByAddressAll((PVOID)x)
|
||||
#define __synchronic_wait_volatile(x,v) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),-1)
|
||||
#define __synchronic_wait_timed_volatile(x,v,t) WaitOnAddress((PVOID)x,(PVOID)&v,sizeof(v),std::chrono::duration_cast<std::chrono::milliseconds>(t).count())
|
||||
#define __synchronic_wake_one_volatile(x) WakeByAddressSingle((PVOID)x)
|
||||
#define __synchronic_wake_all_volatile(x) WakeByAddressAll((PVOID)x)
|
||||
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_pod<x>::value && (sizeof(x) <= 8))
|
||||
|
||||
inline void native_sleep(unsigned long microseconds)
|
||||
{
|
||||
// What to do if microseconds is < 1000?
|
||||
Sleep(microseconds / 1000);
|
||||
}
|
||||
|
||||
inline void native_yield()
|
||||
{
|
||||
SwitchToThread();
|
||||
}
|
||||
#elif defined(__linux__)
|
||||
#include <chrono>
|
||||
#include <time.h>
|
||||
#include <unistd.h>
|
||||
#include <pthread.h>
|
||||
#include <linux/futex.h>
|
||||
#include <sys/syscall.h>
|
||||
#include <climits>
|
||||
#include <cassert>
|
||||
template < class Rep, class Period>
|
||||
inline timespec to_timespec(std::chrono::duration<Rep,Period> const& delta) {
|
||||
struct timespec ts;
|
||||
ts.tv_sec = static_cast<long>(std::chrono::duration_cast<std::chrono::seconds>(delta).count());
|
||||
assert(!ts.tv_sec);
|
||||
ts.tv_nsec = static_cast<long>(std::chrono::duration_cast<std::chrono::nanoseconds>(delta).count());
|
||||
return ts;
|
||||
}
|
||||
inline long futex(void const* addr1, int op, int val1) {
|
||||
return syscall(SYS_futex, addr1, op, val1, 0, 0, 0);
|
||||
}
|
||||
inline long futex(void const* addr1, int op, int val1, struct timespec timeout) {
|
||||
return syscall(SYS_futex, addr1, op, val1, &timeout, 0, 0);
|
||||
}
|
||||
inline void native_sleep(unsigned long microseconds)
|
||||
{
|
||||
usleep(microseconds);
|
||||
}
|
||||
inline void native_yield()
|
||||
{
|
||||
pthread_yield();
|
||||
}
|
||||
|
||||
//the combination of SYS_futex(WAIT) and SYS_futex(WAKE) is supported on all recent Linux distributions
|
||||
#define __synchronic_wait(x,v) futex(x, FUTEX_WAIT_PRIVATE, v)
|
||||
#define __synchronic_wait_timed(x,v,t) futex(x, FUTEX_WAIT_PRIVATE, v, to_timespec(t))
|
||||
#define __synchronic_wake_one(x) futex(x, FUTEX_WAKE_PRIVATE, 1)
|
||||
#define __synchronic_wake_all(x) futex(x, FUTEX_WAKE_PRIVATE, INT_MAX)
|
||||
#define __synchronic_wait_volatile(x,v) futex(x, FUTEX_WAIT, v)
|
||||
#define __synchronic_wait_volatile_timed(x,v,t) futex(x, FUTEX_WAIT, v, to_timespec(t))
|
||||
#define __synchronic_wake_one_volatile(x) futex(x, FUTEX_WAKE, 1)
|
||||
#define __synchronic_wake_all_volatile(x) futex(x, FUTEX_WAKE, INT_MAX)
|
||||
#define __SYNCHRONIC_COMPATIBLE(x) (std::is_integral<x>::value && (sizeof(x) <= 4))
|
||||
|
||||
//the yield function on Linux is better replaced by sched_yield, which is tuned for spin-waiting
|
||||
#undef __synchronic_yield
|
||||
#define __synchronic_yield sched_yield
|
||||
|
||||
//for extremely short wait times, just let another hyper-thread run
|
||||
#undef __synchronic_relax
|
||||
#define __synchronic_relax() asm volatile("rep; nop" ::: "memory")
|
||||
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#ifdef _GLIBCXX_USE_NANOSLEEP
|
||||
inline void portable_sleep(std::chrono::microseconds const& time)
|
||||
{ std::this_thread::sleep_for(time); }
|
||||
#else
|
||||
inline void portable_sleep(std::chrono::microseconds const& time)
|
||||
{ native_sleep(time.count()); }
|
||||
#endif
|
||||
|
||||
#ifdef _GLIBCXX_USE_SCHED_YIELD
|
||||
inline void portable_yield()
|
||||
{ std::this_thread::yield(); }
|
||||
#else
|
||||
inline void portable_yield()
|
||||
{ native_yield(); }
|
||||
#endif
|
||||
|
||||
//this is the number of times we initially spin, on the first wait attempt
|
||||
#define __SYNCHRONIC_SPIN_COUNT_A 16
|
||||
|
||||
//this is how decide to yield instead of just spinning, 'c' is the current trip count
|
||||
//#define __SYNCHRONIC_SPIN_YIELD(c) true
|
||||
#define __SYNCHRONIC_SPIN_RELAX(c) (c>>3)
|
||||
|
||||
//this is the number of times we normally spin, on every subsequent wait attempt
|
||||
#define __SYNCHRONIC_SPIN_COUNT_B 8
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
#endif //__SYNCHRONIC_CONFIG_H
|
||||
@ -1,162 +0,0 @@
|
||||
/*
|
||||
|
||||
Copyright (c) 2014, NVIDIA Corporation
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without modification,
|
||||
are permitted provided that the following conditions are met:
|
||||
|
||||
1. Redistributions of source code must retain the above copyright notice, this
|
||||
list of conditions and the following disclaimer.
|
||||
|
||||
2. Redistributions in binary form must reproduce the above copyright notice,
|
||||
this list of conditions and the following disclaimer in the documentation
|
||||
and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
||||
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
||||
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
|
||||
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
|
||||
INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
|
||||
BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||||
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
|
||||
OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
|
||||
OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
|
||||
*/
|
||||
|
||||
#ifndef KOKKOS_SYNCHRONIC_N3998_HPP
|
||||
#define KOKKOS_SYNCHRONIC_N3998_HPP
|
||||
|
||||
#include <impl/Kokkos_Synchronic.hpp>
|
||||
#include <functional>
|
||||
|
||||
/*
|
||||
In the section below, a synchronization point represents a point at which a
|
||||
thread may block until a given synchronization condition has been reached or
|
||||
at which it may notify other threads that a synchronization condition has
|
||||
been achieved.
|
||||
*/
|
||||
namespace Kokkos { namespace Impl {
|
||||
|
||||
/*
|
||||
A latch maintains an internal counter that is initialized when the latch
|
||||
is created. The synchronization condition is reached when the counter is
|
||||
decremented to 0. Threads may block at a synchronization point waiting
|
||||
for the condition to be reached. When the condition is reached, any such
|
||||
blocked threads will be released.
|
||||
*/
|
||||
struct latch {
|
||||
latch(int val) : count(val), released(false) { }
|
||||
latch(const latch&) = delete;
|
||||
latch& operator=(const latch&) = delete;
|
||||
~latch( ) { }
|
||||
void arrive( ) {
|
||||
__arrive( );
|
||||
}
|
||||
void arrive_and_wait( ) {
|
||||
if(!__arrive( ))
|
||||
wait( );
|
||||
}
|
||||
void wait( ) {
|
||||
while(!released.load_when_not_equal(false,std::memory_order_acquire))
|
||||
;
|
||||
}
|
||||
bool try_wait( ) {
|
||||
return released.load(std::memory_order_acquire);
|
||||
}
|
||||
private:
|
||||
bool __arrive( ) {
|
||||
if(count.fetch_add(-1,std::memory_order_release)!=1)
|
||||
return false;
|
||||
released.store(true,std::memory_order_release);
|
||||
return true;
|
||||
}
|
||||
std::atomic<int> count;
|
||||
synchronic<bool> released;
|
||||
};
|
||||
|
||||
/*
|
||||
A barrier is created with an initial value representing the number of threads
|
||||
that can arrive at the synchronization point. When that many threads have
|
||||
arrived, the synchronization condition is reached and the threads are
|
||||
released. The barrier will then reset, and may be reused for a new cycle, in
|
||||
which the same set of threads may arrive again at the synchronization point.
|
||||
The same set of threads shall arrive at the barrier in each cycle, otherwise
|
||||
the behaviour is undefined.
|
||||
*/
|
||||
struct barrier {
|
||||
barrier(int val) : expected(val), arrived(0), nexpected(val), epoch(0) { }
|
||||
barrier(const barrier&) = delete;
|
||||
barrier& operator=(const barrier&) = delete;
|
||||
~barrier() { }
|
||||
void arrive_and_wait() {
|
||||
int const myepoch = epoch.load(std::memory_order_relaxed);
|
||||
if(!__arrive(myepoch))
|
||||
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
|
||||
;
|
||||
}
|
||||
void arrive_and_drop() {
|
||||
nexpected.fetch_add(-1,std::memory_order_relaxed);
|
||||
__arrive(epoch.load(std::memory_order_relaxed));
|
||||
}
|
||||
private:
|
||||
bool __arrive(int const myepoch) {
|
||||
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
|
||||
if(__builtin_expect(myresult == expected,0)) {
|
||||
expected = nexpected.load(std::memory_order_relaxed);
|
||||
arrived.store(0,std::memory_order_relaxed);
|
||||
epoch.store(myepoch+1,std::memory_order_release);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
int expected;
|
||||
std::atomic<int> arrived, nexpected;
|
||||
synchronic<int> epoch;
|
||||
};
|
||||
|
||||
/*
|
||||
A notifying barrier behaves as a barrier, but is constructed with a callable
|
||||
completion function that is invoked after all threads have arrived at the
|
||||
synchronization point, and before the synchronization condition is reached.
|
||||
The completion may modify the set of threads that arrives at the barrier in
|
||||
each cycle.
|
||||
*/
|
||||
struct notifying_barrier {
|
||||
template <typename T>
|
||||
notifying_barrier(int val, T && f) : expected(val), arrived(0), nexpected(val), epoch(0), completion(std::forward<T>(f)) { }
|
||||
notifying_barrier(const notifying_barrier&) = delete;
|
||||
notifying_barrier& operator=(const notifying_barrier&) = delete;
|
||||
~notifying_barrier( ) { }
|
||||
void arrive_and_wait() {
|
||||
int const myepoch = epoch.load(std::memory_order_relaxed);
|
||||
if(!__arrive(myepoch))
|
||||
while(epoch.load_when_not_equal(myepoch,std::memory_order_acquire) == myepoch)
|
||||
;
|
||||
}
|
||||
void arrive_and_drop() {
|
||||
nexpected.fetch_add(-1,std::memory_order_relaxed);
|
||||
__arrive(epoch.load(std::memory_order_relaxed));
|
||||
}
|
||||
private:
|
||||
bool __arrive(int const myepoch) {
|
||||
int const myresult = arrived.fetch_add(1,std::memory_order_acq_rel) + 1;
|
||||
if(__builtin_expect(myresult == expected,0)) {
|
||||
int const newexpected = completion();
|
||||
expected = newexpected ? newexpected : nexpected.load(std::memory_order_relaxed);
|
||||
arrived.store(0,std::memory_order_relaxed);
|
||||
epoch.store(myepoch+1,std::memory_order_release);
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
int expected;
|
||||
std::atomic<int> arrived, nexpected;
|
||||
synchronic<int> epoch;
|
||||
std::function<int()> completion;
|
||||
};
|
||||
}}
|
||||
|
||||
#endif //__N3998_H
|
||||
@ -76,9 +76,6 @@ namespace Impl {
|
||||
template< typename Space , typename ResultType , typename FunctorType >
|
||||
class TaskBase ;
|
||||
|
||||
template< typename Space >
|
||||
class TaskExec ;
|
||||
|
||||
} /* namespace Impl */
|
||||
} /* namespace Kokkos */
|
||||
|
||||
@ -149,8 +146,8 @@ private:
|
||||
// task->m_next is the dependence or zero
|
||||
// Postcondition:
|
||||
// task->m_next is linked list membership
|
||||
KOKKOS_FUNCTION
|
||||
void schedule( task_root_type * const );
|
||||
KOKKOS_FUNCTION void schedule_runnable( task_root_type * const );
|
||||
KOKKOS_FUNCTION void schedule_aggregate( task_root_type * const );
|
||||
|
||||
// Reschedule a task
|
||||
// Precondition:
|
||||
@ -178,7 +175,7 @@ private:
|
||||
, task_root_type * const );
|
||||
|
||||
KOKKOS_FUNCTION
|
||||
static task_root_type * pop_task( task_root_type * volatile * const );
|
||||
static task_root_type * pop_ready_task( task_root_type * volatile * const );
|
||||
|
||||
KOKKOS_FUNCTION static
|
||||
void decrement( task_root_type * task );
|
||||
@ -368,6 +365,7 @@ public:
|
||||
int16_t m_task_type ; ///< Type of task
|
||||
int16_t m_priority ; ///< Priority of runnable task
|
||||
|
||||
TaskBase() = delete ;
|
||||
TaskBase( TaskBase && ) = delete ;
|
||||
TaskBase( const TaskBase & ) = delete ;
|
||||
TaskBase & operator = ( TaskBase && ) = delete ;
|
||||
@ -375,17 +373,43 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
|
||||
|
||||
// Constructor for a runnable task
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr TaskBase() noexcept
|
||||
: m_apply(0)
|
||||
, m_queue(0)
|
||||
, m_wait(0)
|
||||
, m_next(0)
|
||||
, m_ref_count(0)
|
||||
, m_alloc_size(0)
|
||||
, m_dep_count(0)
|
||||
, m_task_type( TaskSingle )
|
||||
, m_priority( 1 /* TaskRegularPriority */ )
|
||||
constexpr TaskBase( function_type arg_apply
|
||||
, queue_type * arg_queue
|
||||
, TaskBase * arg_dependence
|
||||
, int arg_ref_count
|
||||
, int arg_alloc_size
|
||||
, int arg_task_type
|
||||
, int arg_priority
|
||||
) noexcept
|
||||
: m_apply( arg_apply )
|
||||
, m_queue( arg_queue )
|
||||
, m_wait( 0 )
|
||||
, m_next( arg_dependence )
|
||||
, m_ref_count( arg_ref_count )
|
||||
, m_alloc_size( arg_alloc_size )
|
||||
, m_dep_count( 0 )
|
||||
, m_task_type( arg_task_type )
|
||||
, m_priority( arg_priority )
|
||||
{}
|
||||
|
||||
// Constructor for an aggregate task
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
constexpr TaskBase( queue_type * arg_queue
|
||||
, int arg_ref_count
|
||||
, int arg_alloc_size
|
||||
, int arg_dep_count
|
||||
) noexcept
|
||||
: m_apply( 0 )
|
||||
, m_queue( arg_queue )
|
||||
, m_wait( 0 )
|
||||
, m_next( 0 )
|
||||
, m_ref_count( arg_ref_count )
|
||||
, m_alloc_size( arg_alloc_size )
|
||||
, m_dep_count( arg_dep_count )
|
||||
, m_task_type( Aggregate )
|
||||
, m_priority( 0 )
|
||||
{}
|
||||
|
||||
//----------------------------------------
|
||||
@ -406,9 +430,13 @@ public:
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void add_dependence( TaskBase* dep )
|
||||
{
|
||||
// Precondition: lock == m_next
|
||||
|
||||
TaskBase * const lock = (TaskBase *) LockTag ;
|
||||
|
||||
// Assign dependence to m_next. It will be processed in the subsequent
|
||||
// call to schedule. Error if the dependence is reset.
|
||||
if ( 0 != Kokkos::atomic_exchange( & m_next, dep ) ) {
|
||||
if ( lock != Kokkos::atomic_exchange( & m_next, dep ) ) {
|
||||
Kokkos::abort("TaskScheduler ERROR: resetting task dependence");
|
||||
}
|
||||
|
||||
@ -431,8 +459,13 @@ class TaskBase< ExecSpace , ResultType , void >
|
||||
{
|
||||
private:
|
||||
|
||||
static_assert( sizeof(TaskBase<ExecSpace,void,void>) == 48 , "" );
|
||||
using root_type = TaskBase<ExecSpace,void,void> ;
|
||||
using function_type = typename root_type::function_type ;
|
||||
using queue_type = typename root_type::queue_type ;
|
||||
|
||||
static_assert( sizeof(root_type) == 48 , "" );
|
||||
|
||||
TaskBase() = delete ;
|
||||
TaskBase( TaskBase && ) = delete ;
|
||||
TaskBase( const TaskBase & ) = delete ;
|
||||
TaskBase & operator = ( TaskBase && ) = delete ;
|
||||
@ -444,9 +477,24 @@ public:
|
||||
|
||||
KOKKOS_INLINE_FUNCTION ~TaskBase() = default ;
|
||||
|
||||
// Constructor for runnable task
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskBase()
|
||||
: TaskBase< ExecSpace , void , void >()
|
||||
constexpr TaskBase( function_type arg_apply
|
||||
, queue_type * arg_queue
|
||||
, root_type * arg_dependence
|
||||
, int arg_ref_count
|
||||
, int arg_alloc_size
|
||||
, int arg_task_type
|
||||
, int arg_priority
|
||||
)
|
||||
: root_type( arg_apply
|
||||
, arg_queue
|
||||
, arg_dependence
|
||||
, arg_ref_count
|
||||
, arg_alloc_size
|
||||
, arg_task_type
|
||||
, arg_priority
|
||||
)
|
||||
, m_result()
|
||||
{}
|
||||
|
||||
@ -473,7 +521,10 @@ public:
|
||||
|
||||
using root_type = TaskBase< ExecSpace , void , void > ;
|
||||
using base_type = TaskBase< ExecSpace , ResultType , void > ;
|
||||
using member_type = TaskExec< ExecSpace > ;
|
||||
using specialization = TaskQueueSpecialization< ExecSpace > ;
|
||||
using function_type = typename root_type::function_type ;
|
||||
using queue_type = typename root_type::queue_type ;
|
||||
using member_type = typename specialization::member_type ;
|
||||
using functor_type = FunctorType ;
|
||||
using result_type = ResultType ;
|
||||
|
||||
@ -522,13 +573,30 @@ public:
|
||||
if ( 0 == member->team_rank() && !(task->requested_respawn()) ) {
|
||||
// Did not respawn, destroy the functor to free memory.
|
||||
static_cast<functor_type*>(task)->~functor_type();
|
||||
// Cannot destroy the task until its dependences have been processed.
|
||||
// Cannot destroy and deallocate the task until its dependences
|
||||
// have been processed.
|
||||
}
|
||||
}
|
||||
|
||||
// Constructor for runnable task
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
TaskBase( functor_type const & arg_functor )
|
||||
: base_type()
|
||||
constexpr TaskBase( function_type arg_apply
|
||||
, queue_type * arg_queue
|
||||
, root_type * arg_dependence
|
||||
, int arg_ref_count
|
||||
, int arg_alloc_size
|
||||
, int arg_task_type
|
||||
, int arg_priority
|
||||
, FunctorType && arg_functor
|
||||
)
|
||||
: base_type( arg_apply
|
||||
, arg_queue
|
||||
, arg_dependence
|
||||
, arg_ref_count
|
||||
, arg_alloc_size
|
||||
, arg_task_type
|
||||
, arg_priority
|
||||
)
|
||||
, functor_type( arg_functor )
|
||||
{}
|
||||
|
||||
|
||||
@ -170,6 +170,7 @@ bool TaskQueue< ExecSpace >::push_task
|
||||
)
|
||||
{
|
||||
// Push task into a concurrently pushed and popped queue.
|
||||
// The queue can be either a ready task queue or a waiting task queue.
|
||||
// The queue is a linked list where 'task->m_next' form the links.
|
||||
// Fail the push attempt if the queue is locked;
|
||||
// otherwise retry until the push succeeds.
|
||||
@ -227,13 +228,12 @@ bool TaskQueue< ExecSpace >::push_task
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
typename TaskQueue< ExecSpace >::task_root_type *
|
||||
TaskQueue< ExecSpace >::pop_task
|
||||
TaskQueue< ExecSpace >::pop_ready_task
|
||||
( TaskQueue< ExecSpace >::task_root_type * volatile * const queue )
|
||||
{
|
||||
// Pop task from a concurrently pushed and popped queue.
|
||||
// Pop task from a concurrently pushed and popped ready task queue.
|
||||
// The queue is a linked list where 'task->m_next' form the links.
|
||||
|
||||
task_root_type * const zero = (task_root_type *) 0 ;
|
||||
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
@ -252,42 +252,34 @@ TaskQueue< ExecSpace >::pop_task
|
||||
// (1) lock, (2) end, or (3) a valid task.
|
||||
// Thus zero will never appear in the queue.
|
||||
//
|
||||
// If queue is locked then just read by guaranteeing
|
||||
// the CAS will fail.
|
||||
// If queue is locked then just read by guaranteeing the CAS will fail.
|
||||
|
||||
if ( lock == task ) task = 0 ;
|
||||
|
||||
task_root_type * const x = task ;
|
||||
|
||||
task = Kokkos::atomic_compare_exchange(queue,task,lock);
|
||||
|
||||
if ( x == task ) break ; // CAS succeeded and queue is locked
|
||||
}
|
||||
|
||||
if ( end != task ) {
|
||||
task = Kokkos::atomic_compare_exchange(queue,x,lock);
|
||||
|
||||
if ( x == task ) {
|
||||
// CAS succeeded and queue is locked
|
||||
//
|
||||
// This thread has locked the queue and removed 'task' from the queue.
|
||||
// Extract the next entry of the queue from 'task->m_next'
|
||||
// and mark 'task' as popped from a queue by setting
|
||||
// 'task->m_next = lock'.
|
||||
|
||||
task_root_type * const next =
|
||||
Kokkos::atomic_exchange( & task->m_next , lock );
|
||||
|
||||
//
|
||||
// Place the next entry in the head of the queue,
|
||||
// which also unlocks the queue.
|
||||
//
|
||||
// This thread has exclusive access to
|
||||
// the queue and the popped task's m_next.
|
||||
|
||||
task_root_type * const unlock =
|
||||
Kokkos::atomic_exchange( queue , next );
|
||||
*queue = task->m_next ; task->m_next = lock ;
|
||||
|
||||
if ( next == zero || next == lock || lock != unlock ) {
|
||||
Kokkos::abort("TaskQueue::pop_task ERROR");
|
||||
}
|
||||
}
|
||||
Kokkos::memory_fence();
|
||||
|
||||
#if 0
|
||||
if ( end != task ) {
|
||||
printf( "pop_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
|
||||
printf( "pop_ready_task( 0x%lx 0x%lx { 0x%lx 0x%lx %d %d %d } )\n"
|
||||
, uintptr_t(queue)
|
||||
, uintptr_t(task)
|
||||
, uintptr_t(task->m_wait)
|
||||
@ -295,42 +287,166 @@ TaskQueue< ExecSpace >::pop_task
|
||||
, int(task->m_task_type)
|
||||
, int(task->m_priority)
|
||||
, int(task->m_ref_count) );
|
||||
}
|
||||
#endif
|
||||
|
||||
return task ;
|
||||
}
|
||||
}
|
||||
|
||||
return end ;
|
||||
}
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
void TaskQueue< ExecSpace >::schedule
|
||||
void TaskQueue< ExecSpace >::schedule_runnable
|
||||
( TaskQueue< ExecSpace >::task_root_type * const task )
|
||||
{
|
||||
// Schedule a runnable or when_all task upon construction / spawn
|
||||
// Schedule a runnable task upon construction / spawn
|
||||
// and upon completion of other tasks that 'task' is waiting on.
|
||||
|
||||
// Precondition on runnable task state:
|
||||
// task is either constructing or executing
|
||||
//
|
||||
// Precondition:
|
||||
// - called by a single thread for the input task
|
||||
// - calling thread has exclusive access to the task
|
||||
// - task is not a member of a queue
|
||||
// - if runnable then task is either constructing or respawning
|
||||
//
|
||||
// Constructing state:
|
||||
// task->m_wait == 0
|
||||
// task->m_next == dependence
|
||||
// Executing-respawn state:
|
||||
// task->m_wait == head of linked list
|
||||
// task->m_next == dependence
|
||||
// task->m_next == dependence or 0
|
||||
// Respawn state:
|
||||
// task->m_wait == head of linked list: 'end' or valid task
|
||||
// task->m_next == dependence or 0
|
||||
//
|
||||
// Task state transition:
|
||||
// Constructing -> Waiting
|
||||
// Executing-respawn -> Waiting
|
||||
// Respawn -> Waiting
|
||||
//
|
||||
// Postcondition on task state:
|
||||
// task->m_wait == head of linked list
|
||||
// task->m_next == member of linked list
|
||||
// task->m_wait == head of linked list (queue)
|
||||
// task->m_next == member of linked list (queue)
|
||||
|
||||
#if 0
|
||||
printf( "schedule( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
|
||||
printf( "schedule_runnable( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
|
||||
, uintptr_t(task)
|
||||
, uintptr_t(task->m_wait)
|
||||
, uintptr_t(task->m_next)
|
||||
, task->m_task_type
|
||||
, task->m_priority
|
||||
, task->m_ref_count );
|
||||
#endif
|
||||
|
||||
task_root_type * const zero = (task_root_type *) 0 ;
|
||||
task_root_type * const lock = (task_root_type *) task_root_type::LockTag ;
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
bool respawn = false ;
|
||||
|
||||
//----------------------------------------
|
||||
|
||||
if ( zero == task->m_wait ) {
|
||||
// Task in Constructing state
|
||||
// - Transition to Waiting state
|
||||
// Preconditions:
|
||||
// - call occurs exclusively within a single thread
|
||||
|
||||
task->m_wait = end ;
|
||||
// Task in Waiting state
|
||||
}
|
||||
else if ( lock != task->m_wait ) {
|
||||
// Task in Executing state with Respawn request
|
||||
// - Update dependence
|
||||
// - Transition to Waiting state
|
||||
respawn = true ;
|
||||
}
|
||||
else {
|
||||
// Task in Complete state
|
||||
Kokkos::abort("TaskQueue::schedule_runnable ERROR: task is complete");
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Scheduling a runnable task which may have a depencency 'dep'.
|
||||
// Extract dependence, if any, from task->m_next.
|
||||
// If 'dep' is not null then attempt to push 'task'
|
||||
// into the wait queue of 'dep'.
|
||||
// If the push succeeds then 'task' may be
|
||||
// processed or executed by another thread at any time.
|
||||
// If the push fails then 'dep' is complete and 'task'
|
||||
// is ready to execute.
|
||||
|
||||
// Exclusive access so don't need an atomic exchange
|
||||
// task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
|
||||
task_root_type * dep = task->m_next ; task->m_next = zero ;
|
||||
|
||||
const bool is_ready =
|
||||
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
|
||||
|
||||
if ( ( 0 != dep ) && respawn ) {
|
||||
// Reference count for dep was incremented when
|
||||
// respawn assigned dependency to task->m_next
|
||||
// so that if dep completed prior to the
|
||||
// above push_task dep would not be destroyed.
|
||||
// dep reference count can now be decremented,
|
||||
// which may deallocate the task.
|
||||
TaskQueue::assign( & dep , (task_root_type *)0 );
|
||||
}
|
||||
|
||||
if ( is_ready ) {
|
||||
|
||||
// No dependence or 'dep' is complete so push task into ready queue.
|
||||
// Increment the ready count before pushing into ready queue
|
||||
// to track number of ready + executing tasks.
|
||||
// The ready count will be decremented when the task is complete.
|
||||
|
||||
Kokkos::atomic_increment( & m_ready_count );
|
||||
|
||||
task_root_type * volatile * const ready_queue =
|
||||
& m_ready[ task->m_priority ][ task->m_task_type ];
|
||||
|
||||
// A push_task fails if the ready queue is locked.
|
||||
// A ready queue is only locked during a push or pop;
|
||||
// i.e., it is never permanently locked.
|
||||
// Retry push to ready queue until it succeeds.
|
||||
// When the push succeeds then 'task' may be
|
||||
// processed or executed by another thread at any time.
|
||||
|
||||
while ( ! push_task( ready_queue , task ) );
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Postcondition:
|
||||
// - A runnable 'task' was pushed into a wait or ready queue.
|
||||
// - Concurrent execution may have already popped 'task'
|
||||
// from a queue and processed it as appropriate.
|
||||
}
|
||||
|
||||
template< typename ExecSpace >
|
||||
KOKKOS_FUNCTION
|
||||
void TaskQueue< ExecSpace >::schedule_aggregate
|
||||
( TaskQueue< ExecSpace >::task_root_type * const task )
|
||||
{
|
||||
// Schedule an aggregate task upon construction
|
||||
// and upon completion of other tasks that 'task' is waiting on.
|
||||
//
|
||||
// Precondition:
|
||||
// - called by a single thread for the input task
|
||||
// - calling thread has exclusive access to the task
|
||||
// - task is not a member of a queue
|
||||
//
|
||||
// Constructing state:
|
||||
// task->m_wait == 0
|
||||
// task->m_next == dependence or 0
|
||||
//
|
||||
// Task state transition:
|
||||
// Constructing -> Waiting
|
||||
//
|
||||
// Postcondition on task state:
|
||||
// task->m_wait == head of linked list (queue)
|
||||
// task->m_next == member of linked list (queue)
|
||||
|
||||
#if 0
|
||||
printf( "schedule_aggregate( 0x%lx { 0x%lx 0x%lx %d %d %d }\n"
|
||||
, uintptr_t(task)
|
||||
, uintptr_t(task->m_wait)
|
||||
, uintptr_t(task->m_next)
|
||||
@ -344,71 +460,22 @@ void TaskQueue< ExecSpace >::schedule
|
||||
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
|
||||
|
||||
//----------------------------------------
|
||||
{
|
||||
// If Constructing then task->m_wait == 0
|
||||
// Change to waiting by task->m_wait = EndTag
|
||||
|
||||
task_root_type * const init =
|
||||
Kokkos::atomic_compare_exchange( & task->m_wait , zero , end );
|
||||
if ( zero == task->m_wait ) {
|
||||
// Task in Constructing state
|
||||
// - Transition to Waiting state
|
||||
// Preconditions:
|
||||
// - call occurs exclusively within a single thread
|
||||
|
||||
// Precondition
|
||||
|
||||
if ( lock == init ) {
|
||||
Kokkos::abort("TaskQueue::schedule ERROR: task is complete");
|
||||
task->m_wait = end ;
|
||||
// Task in Waiting state
|
||||
}
|
||||
else if ( lock == task->m_wait ) {
|
||||
// Task in Complete state
|
||||
Kokkos::abort("TaskQueue::schedule_aggregate ERROR: task is complete");
|
||||
}
|
||||
|
||||
// if ( init == 0 ) Constructing -> Waiting
|
||||
// else Executing-Respawn -> Waiting
|
||||
}
|
||||
//----------------------------------------
|
||||
|
||||
if ( task_root_type::Aggregate != task->m_task_type ) {
|
||||
|
||||
// Scheduling a runnable task which may have a depencency 'dep'.
|
||||
// Extract dependence, if any, from task->m_next.
|
||||
// If 'dep' is not null then attempt to push 'task'
|
||||
// into the wait queue of 'dep'.
|
||||
// If the push succeeds then 'task' may be
|
||||
// processed or executed by another thread at any time.
|
||||
// If the push fails then 'dep' is complete and 'task'
|
||||
// is ready to execute.
|
||||
|
||||
task_root_type * dep = Kokkos::atomic_exchange( & task->m_next , zero );
|
||||
|
||||
const bool is_ready =
|
||||
( 0 == dep ) || ( ! push_task( & dep->m_wait , task ) );
|
||||
|
||||
// Reference count for dep was incremented when assigned
|
||||
// to task->m_next so that if it completed prior to the
|
||||
// above push_task dep would not be destroyed.
|
||||
// dep reference count can now be decremented,
|
||||
// which may deallocate the task.
|
||||
TaskQueue::assign( & dep , (task_root_type *)0 );
|
||||
|
||||
if ( is_ready ) {
|
||||
|
||||
// No dependence or 'dep' is complete so push task into ready queue.
|
||||
// Increment the ready count before pushing into ready queue
|
||||
// to track number of ready + executing tasks.
|
||||
// The ready count will be decremented when the task is complete.
|
||||
|
||||
Kokkos::atomic_increment( & m_ready_count );
|
||||
|
||||
task_root_type * volatile * const queue =
|
||||
& m_ready[ task->m_priority ][ task->m_task_type ];
|
||||
|
||||
// A push_task fails if the ready queue is locked.
|
||||
// A ready queue is only locked during a push or pop;
|
||||
// i.e., it is never permanently locked.
|
||||
// Retry push to ready queue until it succeeds.
|
||||
// When the push succeeds then 'task' may be
|
||||
// processed or executed by another thread at any time.
|
||||
|
||||
while ( ! push_task( queue , task ) );
|
||||
}
|
||||
}
|
||||
//----------------------------------------
|
||||
else {
|
||||
// Scheduling a 'when_all' task with multiple dependences.
|
||||
// This scheduling may be called when the 'when_all' is
|
||||
// (1) created or
|
||||
@ -432,7 +499,9 @@ void TaskQueue< ExecSpace >::schedule
|
||||
// The reference count of 'x' was incremented when
|
||||
// it was assigned into the dependence list.
|
||||
|
||||
task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
|
||||
// Exclusive access so don't need an atomic exchange
|
||||
// task_root_type * x = Kokkos::atomic_exchange( aggr + i , zero );
|
||||
task_root_type * x = aggr[i] ; aggr[i] = zero ;
|
||||
|
||||
if ( x ) {
|
||||
|
||||
@ -464,13 +533,11 @@ void TaskQueue< ExecSpace >::schedule
|
||||
|
||||
// '*task' may have been deleted upon completion
|
||||
}
|
||||
}
|
||||
|
||||
//----------------------------------------
|
||||
// Postcondition:
|
||||
// A runnable 'task' was pushed into a wait or ready queue.
|
||||
// An aggregate 'task' was either pushed to a wait queue
|
||||
// or completed.
|
||||
// Concurrent execution may have already popped 'task'
|
||||
// - An aggregate 'task' was either pushed to a wait queue or completed.
|
||||
// - Concurrent execution may have already popped 'task'
|
||||
// from a queue and processed it as appropriate.
|
||||
}
|
||||
|
||||
@ -529,7 +596,7 @@ void TaskQueue< ExecSpace >::complete
|
||||
// Is a runnable task has finished executing and requested respawn.
|
||||
// Schedule the task for subsequent execution.
|
||||
|
||||
schedule( task );
|
||||
schedule_runnable( task );
|
||||
}
|
||||
//----------------------------------------
|
||||
else {
|
||||
@ -556,18 +623,22 @@ void TaskQueue< ExecSpace >::complete
|
||||
TaskQueue::assign( & task , zero );
|
||||
|
||||
// This thread has exclusive access to the wait list so
|
||||
// the concurrency-safe pop_task function is not needed.
|
||||
// the concurrency-safe pop_ready_task function is not needed.
|
||||
// Schedule the tasks that have been waiting on the input 'task',
|
||||
// which may have been deleted.
|
||||
|
||||
while ( x != end ) {
|
||||
// Have exclusive access to 'x' until it is scheduled
|
||||
// Set x->m_next = zero <= no dependence, not a respawn
|
||||
|
||||
// Set x->m_next = zero <= no dependence
|
||||
task_root_type * const next = x->m_next ; x->m_next = 0 ;
|
||||
|
||||
task_root_type * const next =
|
||||
(task_root_type *) Kokkos::atomic_exchange( & x->m_next , zero );
|
||||
|
||||
schedule( x );
|
||||
if ( task_root_type::Aggregate != x->m_task_type ) {
|
||||
schedule_runnable( x );
|
||||
}
|
||||
else {
|
||||
schedule_aggregate( x );
|
||||
}
|
||||
|
||||
x = next ;
|
||||
}
|
||||
|
||||
@ -45,6 +45,7 @@
|
||||
#define KOKKOS_CORE_IMPL_UTILITIES_HPP
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
#include <stdint.h>
|
||||
#include <type_traits>
|
||||
|
||||
//----------------------------------------------------------------------------
|
||||
|
||||
@ -42,46 +42,138 @@
|
||||
*/
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#include <impl/Kokkos_spinwait.hpp>
|
||||
|
||||
#include <Kokkos_Atomic.hpp>
|
||||
#include <impl/Kokkos_BitOps.hpp>
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
#if ( KOKKOS_ENABLE_ASM )
|
||||
#if !defined( _WIN32 )
|
||||
#if defined( KOKKOS_ENABLE_ASM )
|
||||
#if defined( __arm__ ) || defined( __aarch64__ )
|
||||
/* No-operation instruction to idle the thread. */
|
||||
#define YIELD asm volatile("nop")
|
||||
#define KOKKOS_INTERNAL_PAUSE
|
||||
#else
|
||||
/* Pause instruction to prevent excess processor bus usage */
|
||||
#define YIELD asm volatile("pause\n":::"memory")
|
||||
#define KOKKOS_INTERNAL_PAUSE asm volatile("pause\n":::"memory")
|
||||
#endif
|
||||
#elif defined ( KOKKOS_ENABLE_WINTHREAD )
|
||||
#define KOKKOS_INTERNAL_NOP2 asm volatile("nop\n" "nop\n")
|
||||
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
|
||||
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
|
||||
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
|
||||
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
|
||||
namespace {
|
||||
inline void kokkos_internal_yield( const unsigned i ) noexcept {
|
||||
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
|
||||
case 0u: KOKKOS_INTERNAL_NOP2; break;
|
||||
case 1u: KOKKOS_INTERNAL_NOP4; break;
|
||||
case 2u: KOKKOS_INTERNAL_NOP8; break;
|
||||
case 3u: KOKKOS_INTERNAL_NOP16; break;
|
||||
default: KOKKOS_INTERNAL_NOP32;
|
||||
}
|
||||
KOKKOS_INTERNAL_PAUSE;
|
||||
}
|
||||
}
|
||||
#else
|
||||
#include <sched.h>
|
||||
namespace {
|
||||
inline void kokkos_internal_yield( const unsigned ) noexcept {
|
||||
sched_yield();
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#else // defined( _WIN32 )
|
||||
#if defined ( KOKKOS_ENABLE_WINTHREAD )
|
||||
#include <process.h>
|
||||
#define YIELD Sleep(0)
|
||||
#elif defined ( _WIN32) && defined (_MSC_VER)
|
||||
/* Windows w/ Visual Studio */
|
||||
namespace {
|
||||
inline void kokkos_internal_yield( const unsigned ) noexcept {
|
||||
Sleep(0);
|
||||
}
|
||||
}
|
||||
#elif defined( _MSC_VER )
|
||||
#define NOMINMAX
|
||||
#include <winsock2.h>
|
||||
#include <windows.h>
|
||||
#define YIELD YieldProcessor();
|
||||
#elif defined ( _WIN32 )
|
||||
/* Windows w/ Intel*/
|
||||
#define YIELD __asm__ __volatile__("pause\n":::"memory")
|
||||
#else
|
||||
#include <sched.h>
|
||||
#define YIELD sched_yield()
|
||||
namespace {
|
||||
inline void kokkos_internal_yield( const unsigned ) noexcept {
|
||||
YieldProcessor();
|
||||
}
|
||||
}
|
||||
#else
|
||||
#define KOKKOS_INTERNAL_PAUSE __asm__ __volatile__("pause\n":::"memory")
|
||||
#define KOKKOS_INTERNAL_NOP2 __asm__ __volatile__("nop\n" "nop")
|
||||
#define KOKKOS_INTERNAL_NOP4 KOKKOS_INTERNAL_NOP2; KOKKOS_INTERNAL_NOP2
|
||||
#define KOKKOS_INTERNAL_NOP8 KOKKOS_INTERNAL_NOP4; KOKKOS_INTERNAL_NOP4;
|
||||
#define KOKKOS_INTERNAL_NOP16 KOKKOS_INTERNAL_NOP8; KOKKOS_INTERNAL_NOP8;
|
||||
#define KOKKOS_INTERNAL_NOP32 KOKKOS_INTERNAL_NOP16; KOKKOS_INTERNAL_NOP16;
|
||||
namespace {
|
||||
inline void kokkos_internal_yield( const unsigned i ) noexcept {
|
||||
switch (Kokkos::Impl::bit_scan_reverse((i >> 2)+1u)) {
|
||||
case 0: KOKKOS_INTERNAL_NOP2; break;
|
||||
case 1: KOKKOS_INTERNAL_NOP4; break;
|
||||
case 2: KOKKOS_INTERNAL_NOP8; break;
|
||||
case 3: KOKKOS_INTERNAL_NOP16; break;
|
||||
default: KOKKOS_INTERNAL_NOP32;
|
||||
}
|
||||
KOKKOS_INTERNAL_PAUSE;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
||||
/*--------------------------------------------------------------------------*/
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
void spinwait( volatile int & flag , const int value )
|
||||
|
||||
void spinwait_while_equal( volatile int32_t & flag , const int32_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
unsigned i = 0;
|
||||
while ( value == flag ) {
|
||||
YIELD ;
|
||||
kokkos_internal_yield(i);
|
||||
++i;
|
||||
}
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void spinwait_until_equal( volatile int32_t & flag , const int32_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
unsigned i = 0;
|
||||
while ( value != flag ) {
|
||||
kokkos_internal_yield(i);
|
||||
++i;
|
||||
}
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void spinwait_while_equal( volatile int64_t & flag , const int64_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
unsigned i = 0;
|
||||
while ( value == flag ) {
|
||||
kokkos_internal_yield(i);
|
||||
++i;
|
||||
}
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
void spinwait_until_equal( volatile int64_t & flag , const int64_t value )
|
||||
{
|
||||
Kokkos::store_fence();
|
||||
unsigned i = 0;
|
||||
while ( value != flag ) {
|
||||
kokkos_internal_yield(i);
|
||||
++i;
|
||||
}
|
||||
Kokkos::load_fence();
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
} /* namespace Impl */
|
||||
|
||||
@ -47,14 +47,30 @@
|
||||
|
||||
#include <Kokkos_Macros.hpp>
|
||||
|
||||
#include <cstdint>
|
||||
|
||||
namespace Kokkos {
|
||||
namespace Impl {
|
||||
|
||||
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
|
||||
void spinwait( volatile int & flag , const int value );
|
||||
|
||||
void spinwait_while_equal( volatile int32_t & flag , const int32_t value );
|
||||
void spinwait_until_equal( volatile int32_t & flag , const int32_t value );
|
||||
|
||||
void spinwait_while_equal( volatile int64_t & flag , const int64_t value );
|
||||
void spinwait_until_equal( volatile int64_t & flag , const int64_t value );
|
||||
#else
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void spinwait( volatile int & , const int ) {}
|
||||
void spinwait_while_equal( volatile int32_t & , const int32_t ) {}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void spinwait_until_equal( volatile int32_t & , const int32_t ) {}
|
||||
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void spinwait_while_equal( volatile int64_t & , const int64_t ) {}
|
||||
KOKKOS_INLINE_FUNCTION
|
||||
void spinwait_until_equal( volatile int64_t & , const int64_t ) {}
|
||||
|
||||
#endif
|
||||
|
||||
} /* namespace Impl */
|
||||
|
||||
@ -115,10 +115,31 @@ IF(Kokkos_ENABLE_OpenMP)
|
||||
)
|
||||
ENDIF()
|
||||
|
||||
IF(Kokkos_ENABLE_QTHREAD)
|
||||
IF(Kokkos_ENABLE_Qthreads)
|
||||
TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
UnitTest_Qthread
|
||||
SOURCES UnitTestMain.cpp TestQthread.cpp
|
||||
UnitTest_Qthreads
|
||||
SOURCES
|
||||
UnitTestMain.cpp
|
||||
qthreads/TestQthreads_Atomics.cpp
|
||||
qthreads/TestQthreads_Other.cpp
|
||||
qthreads/TestQthreads_Reductions.cpp
|
||||
qthreads/TestQthreads_SubView_a.cpp
|
||||
qthreads/TestQthreads_SubView_b.cpp
|
||||
qthreads/TestQthreads_SubView_c01.cpp
|
||||
qthreads/TestQthreads_SubView_c02.cpp
|
||||
qthreads/TestQthreads_SubView_c03.cpp
|
||||
qthreads/TestQthreads_SubView_c04.cpp
|
||||
qthreads/TestQthreads_SubView_c05.cpp
|
||||
qthreads/TestQthreads_SubView_c06.cpp
|
||||
qthreads/TestQthreads_SubView_c07.cpp
|
||||
qthreads/TestQthreads_SubView_c08.cpp
|
||||
qthreads/TestQthreads_SubView_c09.cpp
|
||||
qthreads/TestQthreads_SubView_c10.cpp
|
||||
qthreads/TestQthreads_SubView_c11.cpp
|
||||
qthreads/TestQthreads_SubView_c12.cpp
|
||||
qthreads/TestQthreads_Team.cpp
|
||||
qthreads/TestQthreads_ViewAPI_a.cpp
|
||||
qthreads/TestQthreads_ViewAPI_b.cpp
|
||||
COMM serial mpi
|
||||
NUM_MPI_PROCS 1
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
@ -194,4 +215,3 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST(
|
||||
FAIL_REGULAR_EXPRESSION " FAILED "
|
||||
TESTONLYLIBS kokkos_gtest
|
||||
)
|
||||
|
||||
|
||||
@ -6,6 +6,7 @@ vpath %.cpp ${KOKKOS_PATH}/core/unit_test
|
||||
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/serial
|
||||
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/threads
|
||||
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/openmp
|
||||
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/qthreads
|
||||
vpath %.cpp ${KOKKOS_PATH}/core/unit_test/cuda
|
||||
|
||||
TEST_HEADERS = $(wildcard $(KOKKOS_PATH)/core/unit_test/*.hpp)
|
||||
@ -78,6 +79,22 @@ endif
|
||||
TEST_TARGETS += test-openmp
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1)
|
||||
OBJ_QTHREADS = TestQthreads_Other.o TestQthreads_Reductions.o TestQthreads_Atomics.o TestQthreads_Team.o
|
||||
OBJ_QTHREADS += TestQthreads_SubView_a.o TestQthreads_SubView_b.o
|
||||
ifeq ($(KOKKOS_INTERNAL_COMPILER_XL), 1)
|
||||
OBJ_QTHREADS += TestQthreads_SubView_c_all.o
|
||||
else
|
||||
OBJ_QTHREADS += TestQthreads_SubView_c01.o TestQthreads_SubView_c02.o TestQthreads_SubView_c03.o
|
||||
OBJ_QTHREADS += TestQthreads_SubView_c04.o TestQthreads_SubView_c05.o TestQthreads_SubView_c06.o
|
||||
OBJ_QTHREADS += TestQthreads_SubView_c07.o TestQthreads_SubView_c08.o TestQthreads_SubView_c09.o
|
||||
OBJ_QTHREADS += TestQthreads_SubView_c10.o TestQthreads_SubView_c11.o TestQthreads_SubView_c12.o
|
||||
endif
|
||||
OBJ_QTHREADS += TestQthreads_ViewAPI_a.o TestQthreads_ViewAPI_b.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosCore_UnitTest_Qthreads
|
||||
TEST_TARGETS += test-qthreads
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1)
|
||||
OBJ_SERIAL = TestSerial_Other.o TestSerial_Reductions.o TestSerial_Atomics.o TestSerial_Team.o
|
||||
OBJ_SERIAL += TestSerial_SubView_a.o TestSerial_SubView_b.o
|
||||
@ -94,12 +111,6 @@ endif
|
||||
TEST_TARGETS += test-serial
|
||||
endif
|
||||
|
||||
ifeq ($(KOKKOS_INTERNAL_USE_QTHREAD), 1)
|
||||
OBJ_QTHREAD = TestQthread.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosCore_UnitTest_Qthread
|
||||
TEST_TARGETS += test-qthread
|
||||
endif
|
||||
|
||||
OBJ_HWLOC = TestHWLOC.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosCore_UnitTest_HWLOC
|
||||
TEST_TARGETS += test-hwloc
|
||||
@ -115,10 +126,6 @@ TARGETS += ${INITTESTS_TARGETS}
|
||||
INITTESTS_TEST_TARGETS := $(addprefix test-default-init-,${INITTESTS_NUMBERS})
|
||||
TEST_TARGETS += ${INITTESTS_TEST_TARGETS}
|
||||
|
||||
OBJ_SYNCHRONIC = TestSynchronic.o UnitTestMain.o gtest-all.o
|
||||
TARGETS += KokkosCore_UnitTest_Synchronic
|
||||
TEST_TARGETS += test-synchronic
|
||||
|
||||
KokkosCore_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Cuda
|
||||
|
||||
@ -131,8 +138,8 @@ KokkosCore_UnitTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS)
|
||||
KokkosCore_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SERIAL) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Serial
|
||||
|
||||
KokkosCore_UnitTest_Qthread: $(OBJ_QTHREAD) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREAD) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthread
|
||||
KokkosCore_UnitTest_Qthreads: $(OBJ_QTHREADS) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_QTHREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Qthreads
|
||||
|
||||
KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_HWLOC
|
||||
@ -146,9 +153,6 @@ KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS)
|
||||
${INITTESTS_TARGETS}: KokkosCore_UnitTest_DefaultDeviceTypeInit_%: TestDefaultDeviceTypeInit_%.o UnitTestMain.o gtest-all.o $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) TestDefaultDeviceTypeInit_$*.o UnitTestMain.o gtest-all.o $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
|
||||
|
||||
KokkosCore_UnitTest_Synchronic: $(OBJ_SYNCHRONIC) $(KOKKOS_LINK_DEPENDS)
|
||||
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_SYNCHRONIC) $(KOKKOS_LIBS) $(LIB) -o KokkosCore_UnitTest_Synchronic
|
||||
|
||||
test-cuda: KokkosCore_UnitTest_Cuda
|
||||
./KokkosCore_UnitTest_Cuda
|
||||
|
||||
@ -161,8 +165,8 @@ test-openmp: KokkosCore_UnitTest_OpenMP
|
||||
test-serial: KokkosCore_UnitTest_Serial
|
||||
./KokkosCore_UnitTest_Serial
|
||||
|
||||
test-qthread: KokkosCore_UnitTest_Qthread
|
||||
./KokkosCore_UnitTest_Qthread
|
||||
test-qthreads: KokkosCore_UnitTest_Qthreads
|
||||
./KokkosCore_UnitTest_Qthreads
|
||||
|
||||
test-hwloc: KokkosCore_UnitTest_HWLOC
|
||||
./KokkosCore_UnitTest_HWLOC
|
||||
@ -176,9 +180,6 @@ test-default: KokkosCore_UnitTest_Default
|
||||
${INITTESTS_TEST_TARGETS}: test-default-init-%: KokkosCore_UnitTest_DefaultDeviceTypeInit_%
|
||||
./KokkosCore_UnitTest_DefaultDeviceTypeInit_$*
|
||||
|
||||
test-synchronic: KokkosCore_UnitTest_Synchronic
|
||||
./KokkosCore_UnitTest_Synchronic
|
||||
|
||||
build_all: $(TARGETS)
|
||||
|
||||
test: $(TEST_TARGETS)
|
||||
@ -193,4 +194,3 @@ clean: kokkos-clean
|
||||
|
||||
gtest-all.o:$(GTEST_PATH)/gtest/gtest-all.cc
|
||||
$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $(GTEST_PATH)/gtest/gtest-all.cc
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user