Kokkos lib update

Steve Plimpton
2016-09-08 13:56:18 -06:00
parent 0252347d43
commit 236ebf7fab
212 changed files with 18902 additions and 13466 deletions

View File

@@ -1,4 +1,15 @@
+IF(COMMAND TRIBITS_PACKAGE_DECL)
+  SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "")
+ELSE()
+  SET(KOKKOS_HAS_TRILINOS OFF CACHE BOOL "")
+ENDIF()
+IF(NOT KOKKOS_HAS_TRILINOS)
+  CMAKE_MINIMUM_REQUIRED(VERSION 2.8.11 FATAL_ERROR)
+  INCLUDE(cmake/tribits.cmake)
+ENDIF()
 #
 # A) Forward delcare the package so that certain options are also defined for
 # subpackages
@@ -12,7 +23,22 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS)
 # subpackages as well.
 #
-TRIBITS_ADD_DEBUG_OPTION()
+# mfh 01 Aug 2016: See Issue #61:
+#
+# https://github.com/kokkos/kokkos/issues/61
+#
+# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines
+# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead,
+# for compatibility with Kokkos' Makefile build system.
+TRIBITS_ADD_OPTION_AND_DEFINE(
+  ${PACKAGE_NAME}_ENABLE_DEBUG
+  ${PACKAGE_NAME_UC}_HAVE_DEBUG
+  "Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build."
+  ${${PROJECT_NAME}_ENABLE_DEBUG}
+)
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_SIERRA_BUILD
@@ -82,11 +108,33 @@ TRIBITS_ADD_OPTION_AND_DEFINE(
   "${TPL_ENABLE_MPI}"
 )
+# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option
+#
+# CMake is case sensitive. The Kokkos_ENABLE_Debug_Bounds_Check
+# option (defined below) is annoyingly not all caps, but we need to
+# keep it that way for backwards compatibility. If users forget and
+# try using an all-caps variable, then make it count by using the
+# all-caps version as the default value of the original, not-all-caps
+# option. Otherwise, the default value of this option comes from
+# Kokkos_ENABLE_DEBUG (see Issue #367).
+ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG)
+IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
+  IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK)
+    SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON)
+  ELSE()
+    SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
+  ENDIF()
+ELSE()
+  SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}")
+ENDIF()
+ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT)
 TRIBITS_ADD_OPTION_AND_DEFINE(
   Kokkos_ENABLE_Debug_Bounds_Check
   KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
-  "Enable bounds checking support in Kokkos."
-  OFF
+  "Enable Kokkos::View run-time bounds checking."
+  "${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}"
 )
 TRIBITS_ADD_OPTION_AND_DEFINE(
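
The net effect of the Debug_Bounds_Check option above is a KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK preprocessor define. A minimal sketch of what enabling it means for user code, assuming a build configured with the option ON (the view size and indices here are made up for illustration):

    #include <Kokkos_Core.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        Kokkos::View<double*> a("A", 10);
        a(5) = 1.0;   // in range: identical behavior with or without the check
        // With KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK defined, an out-of-range
        // access such as a(42) aborts with a bounds-error message instead
        // of silently reading or writing past the allocation; without the
        // define the access is unchecked.
      }
      Kokkos::finalize();
      return 0;
    }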

View File

@@ -7,7 +7,7 @@ CXXFLAGS=$(CCFLAGS)
 #Options: OpenMP,Serial,Pthreads,Cuda
 KOKKOS_DEVICES ?= "OpenMP"
 #KOKKOS_DEVICES ?= "Pthreads"
-#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,ARMv8,BGQ,Power7,Power8,KNL
+#Options: KNC,SNB,HSW,Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal61,ARMv8,BGQ,Power7,Power8,KNL,BDW
 KOKKOS_ARCH ?= ""
 #Options: yes,no
 KOKKOS_DEBUG ?= "no"
@@ -97,6 +97,7 @@ KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda |
 KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l))
 #NVIDIA based
@@ -108,10 +109,12 @@ KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l))
+KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l))
 KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+  + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@@ -123,6 +126,7 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_AR
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \
   + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \
+  + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \
   + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc))
@@ -142,11 +146,11 @@ KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AM
 #Any AVX?
 KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc ))
-KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW) | bc ))
+KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc ))
 KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 # Decide what ISA level we are able to support
-KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
+KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc ))
 KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc ))
 KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8) | bc ))
@@ -304,8 +308,8 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1)
   tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp )
-  KOKKOS_CXXFLAGS += -mcpu=power8
-  KOKKOS_LDFLAGS += -mcpu=power8
+  KOKKOS_CXXFLAGS += -mcpu=power8 -mtune=power8
+  KOKKOS_LDFLAGS += -mcpu=power8 -mtune=power8
 endif
 ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
@@ -321,8 +325,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX2), 1)
   else
     # Assume that this is a really a GNU compiler
-    KOKKOS_CXXFLAGS += -march=core-avx2
-    KOKKOS_LDFLAGS += -march=core-avx2
+    KOKKOS_CXXFLAGS += -march=core-avx2 -mtune=core-avx2
+    KOKKOS_LDFLAGS += -march=core-avx2 -mtune=core-avx2
   endif
 endif
 endif
@@ -390,6 +394,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1)
   tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp )
   KOKKOS_CXXFLAGS += -arch=sm_53
 endif
+ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1)
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp )
+  tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp )
+  KOKKOS_CXXFLAGS += -arch=sm_61
+endif
 endif
 KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h)
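
Each selected KOKKOS_ARCH value is also recorded as a KOKKOS_ARCH_* macro in the generated KokkosCore_config.h (see the echo lines above), so code can branch on the build target at compile time. A minimal sketch, assuming a tree configured with one of the architectures above (the probe program itself is hypothetical):

    // arch_probe.cpp -- compile with the include path that holds the
    // generated KokkosCore_config.h.
    #include <cstdio>
    #include <KokkosCore_config.h>

    int main() {
    #if defined(KOKKOS_ARCH_PASCAL61)
      std::puts("built for NVIDIA Pascal (sm_61)");
    #elif defined(KOKKOS_ARCH_POWER8)
      std::puts("built for IBM POWER8");
    #else
      std::puts("no arch-specific macro set");
    #endif
      return 0;
    }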

View File

@@ -1,9 +1,5 @@
 Kokkos_UnorderedMap_impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/containers/src/impl/Kokkos_UnorderedMap_impl.cpp
-Kokkos_AllocationTracker.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_AllocationTracker.cpp
-Kokkos_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_BasicAllocators.cpp
 Kokkos_Core.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Core.cpp
 Kokkos_CPUDiscovery.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_CPUDiscovery.cpp
@@ -20,6 +16,10 @@ Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Seria
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial.cpp
 Kokkos_Serial_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_TaskPolicy.cpp
+Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp
+Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Serial_Task.cpp
 Kokkos_Shape.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Shape.cpp
 Kokkos_spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_spinwait.cpp
@@ -32,12 +32,12 @@ Kokkos_MemoryPool.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_M
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemoryPool.cpp
 ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1)
-Kokkos_Cuda_BasicAllocators.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
-	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_BasicAllocators.cpp
 Kokkos_Cuda_Impl.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Impl.cpp
 Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp
+Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp
 Kokkos_Cuda_TaskPolicy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_TaskPolicy.cpp
 endif
@@ -61,6 +61,8 @@ endif
 ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1)
 Kokkos_OpenMPexec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
 	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMPexec.cpp
+Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
+	$(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp
 endif
 Kokkos_HBWSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HBWSpace.cpp

View File

@@ -37,7 +37,7 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov
 ====Requirements============================================================
 ============================================================================
-Primary tested compilers are:
+Primary tested compilers on X86 are:
     GCC 4.7.2
     GCC 4.8.4
     GCC 4.9.2
@@ -48,26 +48,43 @@ Primary tested compilers are:
     Clang 3.5.2
     Clang 3.6.1
+Primary tested compilers on Power 8 are:
+    IBM XL 13.1.3 (OpenMP,Serial)
+    GCC 4.9.2 (OpenMP,Serial)
+    GCC 5.3.0 (OpenMP,Serial)
 Secondary tested compilers are:
     CUDA 6.5 (with gcc 4.7.2)
     CUDA 7.0 (with gcc 4.7.2)
     CUDA 7.5 (with gcc 4.8.4)
 Other compilers working:
+    X86:
+        Intel 17.0.042 (the FENL example causes internal compiler error)
     PGI 15.4
+        IBM XL 13.1.2
     Cygwin 2.1.0 64bit with gcc 4.9.3
+    KNL:
+        Intel 16.2.181 (the FENL example causes internal compiler error)
+        Intel 17.0.042 (the FENL example causes internal compiler error)
+Known non-working combinations:
+    Power8:
+        GCC 6.1.0
+        Pthreads backend
 Primary tested compiler are passing in release mode
-with warnings as errors. We are using the following set
-of flags:
+with warnings as errors. They also are tested with a comprehensive set of
+backend combinations (i.e. OpenMP, Pthreads, Serial, OpenMP+Serial, ...).
+We are using the following set of flags:
 GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits
      -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized
 Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
 Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized
 Secondary compilers are passing without -Werror.
-Other compilers are tested occasionally.
+Other compilers are tested occasionally, in particular when pushing from develop to
+master branch, without -Werror and only for a select set of backends.
 ============================================================================
 ====Getting started=========================================================

View File

@@ -771,6 +771,7 @@ namespace Kokkos {
     friend class Random_XorShift1024_Pool<DeviceType>;
   public:
+    typedef Random_XorShift1024_Pool<DeviceType> pool_type;
     typedef DeviceType device_type;
     enum {MAX_URAND = 0xffffffffU};
@@ -779,10 +780,10 @@
     enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
     KOKKOS_INLINE_FUNCTION
-    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
       p_(p),state_idx_(state_idx){
       for(int i=0 ; i<16; i++)
-        state_[i] = state[i];
+        state_[i] = state(state_idx,i);
     }
     KOKKOS_INLINE_FUNCTION
@@ -933,6 +934,7 @@
     state_data_type state_;
     int_view_type p_;
     int num_states_;
+    friend class Random_XorShift1024<DeviceType>;
   public:
     typedef Random_XorShift1024<DeviceType> generator_type;
@@ -1001,7 +1003,7 @@
     KOKKOS_INLINE_FUNCTION
     Random_XorShift1024<DeviceType> get_state() const {
       const int i = DeviceType::hardware_thread_id();
-      return Random_XorShift1024<DeviceType>(&state_(i,0),p_(i),i);
+      return Random_XorShift1024<DeviceType>(state_,p_(i),i);
     };
     KOKKOS_INLINE_FUNCTION
@@ -1020,10 +1022,12 @@
     int p_;
     const int state_idx_;
     uint64_t* state_;
+    const int stride_;
     friend class Random_XorShift1024_Pool<Kokkos::Cuda>;
   public:
     typedef Kokkos::Cuda device_type;
+    typedef Random_XorShift1024_Pool<device_type> pool_type;
     enum {MAX_URAND = 0xffffffffU};
     enum {MAX_URAND64 = 0xffffffffffffffffULL-1};
@@ -1031,30 +1035,30 @@
     enum {MAX_RAND64 = static_cast<int64_t>(0xffffffffffffffffULL/2-1)};
     KOKKOS_INLINE_FUNCTION
-    Random_XorShift1024 (uint64_t* state, int p, int state_idx = 0):
-      p_(p),state_idx_(state_idx),state_(state){
+    Random_XorShift1024 (const typename pool_type::state_data_type& state, int p, int state_idx = 0):
+      p_(p),state_idx_(state_idx),state_(&state(state_idx,0)),stride_(state.stride_1()){
     }
     KOKKOS_INLINE_FUNCTION
     uint32_t urand() {
-      uint64_t state_0 = state_[ p_ ];
-      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
       state_1 ^= state_1 << 31;
      state_1 ^= state_1 >> 11;
       state_0 ^= state_0 >> 30;
-      uint64_t tmp = ( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
+      uint64_t tmp = ( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981ULL;
       tmp = tmp>>16;
       return static_cast<uint32_t>(tmp&MAX_URAND);
     }
     KOKKOS_INLINE_FUNCTION
     uint64_t urand64() {
-      uint64_t state_0 = state_[ p_ ];
-      uint64_t state_1 = state_[ p_ = ( p_ + 1 ) & 15 ];
+      uint64_t state_0 = state_[ p_ * stride_ ];
+      uint64_t state_1 = state_[ (p_ = ( p_ + 1 ) & 15) * stride_ ];
       state_1 ^= state_1 << 31;
       state_1 ^= state_1 >> 11;
       state_0 ^= state_0 >> 30;
-      return (( state_[ p_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
+      return (( state_[ p_ * stride_ ] = state_0 ^ state_1 ) * 1181783497276652981LL) - 1;
     }
     KOKKOS_INLINE_FUNCTION
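
For reference, the update in urand()/urand64() above is the xorshift1024* step; the stride only changes how the 16-word state is laid out in memory, not the arithmetic. A self-contained sketch of the same generator with a contiguous state array, assuming any nonzero seed (the seeding below is a toy choice):

    #include <cstdint>
    #include <cstdio>

    // xorshift1024* with contiguous state; same shifts (31, 11, 30) and
    // multiplier 1181783497276652981 as the Kokkos generator above.
    struct XorShift1024 {
      uint64_t s[16];  // must not be all zero
      int p = 0;

      uint64_t next() {
        uint64_t s0 = s[p];
        uint64_t s1 = s[p = (p + 1) & 15];
        s1 ^= s1 << 31;
        s1 ^= s1 >> 11;
        s0 ^= s0 >> 30;
        return (s[p] = s0 ^ s1) * 1181783497276652981ULL;
      }
    };

    int main() {
      XorShift1024 g;
      for (int i = 0; i < 16; ++i) g.s[i] = 0x9E3779B97F4A7C15ULL * (i + 1);
      std::printf("%llu\n", (unsigned long long)g.next());
      return 0;
    }
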
@@ -1227,9 +1231,9 @@ Random_XorShift1024<Kokkos::Cuda> Random_XorShift1024_Pool<Kokkos::Cuda>::get_st
     if(i>=num_states_) {i = i_offset;}
   }
-  return Random_XorShift1024<Kokkos::Cuda>(&state_(i,0), p_(i), i);
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(i), i);
 #else
-  return Random_XorShift1024<Kokkos::Cuda>(&state_(0,0), p_(0), 0);
+  return Random_XorShift1024<Kokkos::Cuda>(state_, p_(0), 0);
 #endif
 }
@@ -1248,14 +1252,15 @@ void Random_XorShift1024_Pool<Kokkos::Cuda>::free_state(const Random_XorShift102
 #endif
+namespace Impl {
-template<class ViewType, class RandomPool, int loops, int rank>
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
 struct fill_random_functor_range;
-template<class ViewType, class RandomPool, int loops, int rank>
+template<class ViewType, class RandomPool, int loops, int rank, class IndexType>
 struct fill_random_functor_begin_end;
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,1,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1268,19 +1273,19 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,1>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (const IndexType& i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0())
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0()))
         a(idx) = Rand::draw(gen,range);
     }
     rand_pool.free_state(gen);
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,2,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1293,12 +1298,12 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
          a(idx,k) = Rand::draw(gen,range);
       }
     }
@@ -1307,8 +1312,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,2>{
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,3,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1321,13 +1326,13 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
            a(idx,k,l) = Rand::draw(gen,range);
       }
     }
@@ -1335,8 +1340,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,3>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,4, IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1349,14 +1354,14 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
              a(idx,k,l,m) = Rand::draw(gen,range);
       }
     }
@@ -1364,8 +1369,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,4>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,5,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1378,15 +1383,15 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
                a(idx,k,l,m,n) = Rand::draw(gen,range);
       }
     }
@@ -1394,8 +1399,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,5>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,6,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1408,16 +1413,16 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
                  a(idx,k,l,m,n,o) = Rand::draw(gen,range);
       }
     }
@@ -1425,8 +1430,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,6>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,7,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1439,17 +1444,17 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
                    a(idx,k,l,m,n,o,p) = Rand::draw(gen,range);
       }
     }
@@ -1457,8 +1462,8 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,7>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_range<ViewType,RandomPool,loops,8,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1471,26 +1476,26 @@ struct fill_random_functor_range<ViewType,RandomPool,loops,8>{
     a(a_),rand_pool(rand_pool_),range(range_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
-                    for(unsigned int q=0;q<a.dimension_7();q++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
                      a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,range);
       }
     }
     rand_pool.free_state(gen);
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1503,19 +1508,19 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,1>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0())
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0()))
        a(idx) = Rand::draw(gen,begin,end);
     }
     rand_pool.free_state(gen);
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1528,12 +1533,12 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
          a(idx,k) = Rand::draw(gen,begin,end);
       }
     }
@@ -1542,8 +1547,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,2>{
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1556,13 +1561,13 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
            a(idx,k,l) = Rand::draw(gen,begin,end);
       }
     }
@@ -1570,8 +1575,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,3>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1584,14 +1589,14 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
              a(idx,k,l,m) = Rand::draw(gen,begin,end);
       }
     }
@@ -1599,8 +1604,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,4>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1613,15 +1618,15 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()){
-        for(unsigned int l=0;l<a.dimension_1();l++)
-          for(unsigned int m=0;m<a.dimension_2();m++)
-            for(unsigned int n=0;n<a.dimension_3();n++)
-              for(unsigned int o=0;o<a.dimension_4();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())){
+        for(IndexType l=0;l<static_cast<IndexType>(a.dimension_1());l++)
+          for(IndexType m=0;m<static_cast<IndexType>(a.dimension_2());m++)
+            for(IndexType n=0;n<static_cast<IndexType>(a.dimension_3());n++)
+              for(IndexType o=0;o<static_cast<IndexType>(a.dimension_4());o++)
               a(idx,l,m,n,o) = Rand::draw(gen,begin,end);
       }
     }
@@ -1629,8 +1634,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,5>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1643,16 +1648,16 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
                  a(idx,k,l,m,n,o) = Rand::draw(gen,begin,end);
       }
     }
@@ -1661,8 +1666,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,6>{
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1675,17 +1680,17 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
                    a(idx,k,l,m,n,o,p) = Rand::draw(gen,begin,end);
       }
     }
@@ -1693,8 +1698,8 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,7>{
   }
 };
-template<class ViewType, class RandomPool, int loops>
-struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
+template<class ViewType, class RandomPool, int loops, class IndexType>
+struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8,IndexType>{
   typedef typename ViewType::execution_space execution_space;
   ViewType a;
   RandomPool rand_pool;
@@ -1707,18 +1712,18 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
     a(a_),rand_pool(rand_pool_),begin(begin_),end(end_) {}
   KOKKOS_INLINE_FUNCTION
-  void operator() (unsigned int i) const {
+  void operator() (IndexType i) const {
     typename RandomPool::generator_type gen = rand_pool.get_state();
-    for(unsigned int j=0;j<loops;j++) {
-      const uint64_t idx = i*loops+j;
-      if(idx<a.dimension_0()) {
-        for(unsigned int k=0;k<a.dimension_1();k++)
-          for(unsigned int l=0;l<a.dimension_2();l++)
-            for(unsigned int m=0;m<a.dimension_3();m++)
-              for(unsigned int n=0;n<a.dimension_4();n++)
-                for(unsigned int o=0;o<a.dimension_5();o++)
-                  for(unsigned int p=0;p<a.dimension_6();p++)
-                    for(unsigned int q=0;q<a.dimension_7();q++)
+    for(IndexType j=0;j<loops;j++) {
+      const IndexType idx = i*loops+j;
+      if(idx<static_cast<IndexType>(a.dimension_0())) {
+        for(IndexType k=0;k<static_cast<IndexType>(a.dimension_1());k++)
+          for(IndexType l=0;l<static_cast<IndexType>(a.dimension_2());l++)
+            for(IndexType m=0;m<static_cast<IndexType>(a.dimension_3());m++)
+              for(IndexType n=0;n<static_cast<IndexType>(a.dimension_4());n++)
+                for(IndexType o=0;o<static_cast<IndexType>(a.dimension_5());o++)
+                  for(IndexType p=0;p<static_cast<IndexType>(a.dimension_6());p++)
+                    for(IndexType q=0;q<static_cast<IndexType>(a.dimension_7());q++)
                      a(idx,k,l,m,n,o,p,q) = Rand::draw(gen,begin,end);
       }
     }
@@ -1726,18 +1731,20 @@ struct fill_random_functor_begin_end<ViewType,RandomPool,loops,8>{
   }
 };
-template<class ViewType, class RandomPool>
+}
+template<class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type range) {
   int64_t LDA = a.dimension_0();
   if(LDA>0)
-    parallel_for((LDA+127)/128,fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank>(a,g,range));
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_range<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,range));
 }
-template<class ViewType, class RandomPool>
+template<class ViewType, class RandomPool, class IndexType = int64_t>
 void fill_random(ViewType a, RandomPool g, typename ViewType::const_value_type begin,typename ViewType::const_value_type end ) {
   int64_t LDA = a.dimension_0();
   if(LDA>0)
-    parallel_for((LDA+127)/128,fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank>(a,g,begin,end));
+    parallel_for((LDA+127)/128,Impl::fill_random_functor_begin_end<ViewType,RandomPool,128,ViewType::Rank,IndexType>(a,g,begin,end));
 }
 }
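
The Impl:: functors above are what the two public fill_random overloads dispatch to, one work item filling 128 consecutive entries per pool generator. A minimal usage sketch (view extent and seed are arbitrary):

    #include <Kokkos_Core.hpp>
    #include <Kokkos_Random.hpp>

    int main(int argc, char* argv[]) {
      Kokkos::initialize(argc, argv);
      {
        Kokkos::View<double*> a("A", 1 << 20);
        // Pool of per-thread generators, seeded once.
        Kokkos::Random_XorShift64_Pool<> pool(12345);
        // Uniform draws between 0 and 1; dispatches to the rank-1
        // Impl::fill_random_functor_begin_end with the default
        // IndexType = int64_t.
        Kokkos::fill_random(a, pool, 0.0, 1.0);
      }
      Kokkos::finalize();
      return 0;
    }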

View File

@@ -50,6 +50,7 @@
 #include <Kokkos_Core.hpp>
 #include <Kokkos_Random.hpp>
 #include <cmath>
+#include <chrono>
 namespace Test {
@@ -207,7 +208,6 @@ struct test_histogram1d_functor {
     density_1d (d1d),
     mean (1.0*num_draws/HIST_DIM1D*3)
   {
-    printf ("Mean: %e\n", mean);
   }
   KOKKOS_INLINE_FUNCTION void
@@ -295,7 +295,7 @@ struct test_random_scalar {
     parallel_reduce (num_draws/1024, functor_type (pool, density_1d, density_3d), result);
     //printf("Result: %lf %lf %lf\n",result.mean/num_draws/3,result.variance/num_draws/3,result.covariance/num_draws/2);
-    double tolerance = 2.0*sqrt(1.0/num_draws);
+    double tolerance = 1.6*sqrt(1.0/num_draws);
     double mean_expect = 0.5*Kokkos::rand<rnd_type,Scalar>::max();
     double variance_expect = 1.0/3.0*mean_expect*mean_expect;
     double mean_eps = mean_expect/(result.mean/num_draws/3)-1.0;
@@ -303,10 +303,10 @@ struct test_random_scalar {
     double covariance_eps = result.covariance/num_draws/2/variance_expect;
     pass_mean = ((-tolerance < mean_eps) &&
                  ( tolerance > mean_eps)) ? 1:0;
-    pass_var = ((-tolerance < variance_eps) &&
-                ( tolerance > variance_eps)) ? 1:0;
-    pass_covar = ((-1.4*tolerance < covariance_eps) &&
-                  ( 1.4*tolerance > covariance_eps)) ? 1:0;
+    pass_var = ((-1.5*tolerance < variance_eps) &&
+                ( 1.5*tolerance > variance_eps)) ? 1:0;
+    pass_covar = ((-2.0*tolerance < covariance_eps) &&
+                  ( 2.0*tolerance > covariance_eps)) ? 1:0;
     cerr << "Pass: " << pass_mean
          << " " << pass_var
         << " " << mean_eps
@@ -328,12 +328,12 @@ struct test_random_scalar {
     double mean_eps = mean_expect/(result.mean/HIST_DIM1D)-1.0;
     double variance_eps = variance_expect/(result.variance/HIST_DIM1D)-1.0;
     double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
-    pass_hist1d_mean = ((-tolerance < mean_eps) &&
-                        ( tolerance > mean_eps)) ? 1:0;
-    pass_hist1d_var = ((-tolerance < variance_eps) &&
-                       ( tolerance > variance_eps)) ? 1:0;
-    pass_hist1d_covar = ((-tolerance < covariance_eps) &&
-                         ( tolerance > covariance_eps)) ? 1:0;
+    pass_hist1d_mean = ((-0.0001 < mean_eps) &&
+                        ( 0.0001 > mean_eps)) ? 1:0;
+    pass_hist1d_var = ((-0.07 < variance_eps) &&
+                       ( 0.07 > variance_eps)) ? 1:0;
+    pass_hist1d_covar = ((-0.06 < covariance_eps) &&
+                         ( 0.06 > covariance_eps)) ? 1:0;
     cerr << "Density 1D: " << mean_eps
          << " " << variance_eps
@@ -363,8 +363,8 @@ struct test_random_scalar {
     double covariance_eps = (result.covariance/HIST_DIM1D - covariance_expect)/mean_expect;
     pass_hist3d_mean = ((-tolerance < mean_eps) &&
                         ( tolerance > mean_eps)) ? 1:0;
-    pass_hist3d_var = ((-tolerance < variance_eps) &&
-                       ( tolerance > variance_eps)) ? 1:0;
+    pass_hist3d_var = ((-1.2*tolerance < variance_eps) &&
+                       ( 1.2*tolerance > variance_eps)) ? 1:0;
     pass_hist3d_covar = ((-tolerance < covariance_eps) &&
                          ( tolerance > covariance_eps)) ? 1:0;
@@ -386,8 +386,13 @@ void test_random(unsigned int num_draws)
   typename test_random_functor<RandomGenerator,int>::type_1d density_1d("D1d");
   typename test_random_functor<RandomGenerator,int>::type_3d density_3d("D3d");
+  uint64_t ticks = std::chrono::high_resolution_clock::now().time_since_epoch().count();
+  cerr << "Test Seed:" << ticks << endl;
+  RandomGenerator pool(ticks);
   cerr << "Test Scalar=int" << endl;
-  RandomGenerator pool(31891);
   test_random_scalar<RandomGenerator,int> test_int(density_1d,density_3d,pool,num_draws);
   ASSERT_EQ( test_int.pass_mean,1);
   ASSERT_EQ( test_int.pass_var,1);

View File

@ -0,0 +1,79 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
# Check for CUDA support
SET(_CUDA_FAILURE OFF)
# Have CMake find CUDA
IF(NOT _CUDA_FAILURE)
FIND_PACKAGE(CUDA 3.2)
IF (NOT CUDA_FOUND)
SET(_CUDA_FAILURE ON)
ENDIF()
ENDIF()
IF(NOT _CUDA_FAILURE)
# If CUDA was found, set up the CUDA TPL
macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target)
TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY)
endmacro()
GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE})
GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUDA)
ELSE()
SET(TPL_ENABLE_CUDA OFF)
ENDIF()

View File

@ -0,0 +1,64 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
include(${TRIBITS_DEPS_DIR}/CUDA.cmake)
IF (TPL_ENABLE_CUDA)
GLOBAL_SET(TPL_CUSPARSE_LIBRARY_DIRS)
GLOBAL_SET(TPL_CUSPARSE_INCLUDE_DIRS ${TPL_CUDA_INCLUDE_DIRS})
GLOBAL_SET(TPL_CUSPARSE_LIBRARIES ${CUDA_cusparse_LIBRARY})
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(CUSPARSE)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Hardware locality detection and control library.
#
# Acquisition information:
# Date checked: November 2011
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: http://www.open-mpi.org/projects/hwloc/
# Version: 1.3
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC
REQUIRED_HEADERS hwloc.h
REQUIRED_LIBS_NAMES "hwloc"
)

View File

@ -0,0 +1,83 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
SET(USE_THREADS FALSE)
IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES)
# Use CMake's Thread finder since it is a bit smarter in determining
# whether pthreads is already built into the compiler and doesn't need
# a library to link.
FIND_PACKAGE(Threads)
#If Threads found a copy of pthreads make sure it is one of the cases the tribits
#tpl system cannot handle.
IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT)
IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread")
SET(USE_THREADS TRUE)
ENDIF()
ENDIF()
ENDIF()
IF(USE_THREADS)
SET(TPL_Pthread_INCLUDE_DIRS "")
SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}")
SET(TPL_Pthread_LIBRARY_DIRS "")
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(Pthread)
ELSE()
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread
REQUIRED_HEADERS pthread.h
REQUIRED_LIBS_NAMES pthread
)
ENDIF()

View File

@ -0,0 +1,70 @@
# @HEADER
# ************************************************************************
#
# Trilinos: An Object-Oriented Solver Framework
# Copyright (2001) Sandia Corporation
#
#
# Copyright (2001) Sandia Corporation. Under the terms of Contract
# DE-AC04-94AL85000, there is a non-exclusive license for use of this
# work by or on behalf of the U.S. Government. Export of this program
# may require a license from the United States Government.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
# 1. Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the Corporation nor the names of the
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# NOTICE: The United States Government is granted for itself and others
# acting on its behalf a paid-up, nonexclusive, irrevocable worldwide
# license in this data to reproduce, prepare derivative works, and
# perform publicly and display publicly. Beginning five (5) years from
# July 25, 2001, the United States Government is granted for itself and
# others acting on its behalf a paid-up, nonexclusive, irrevocable
# worldwide license in this data to reproduce, prepare derivative works,
# distribute copies to the public, perform publicly and display
# publicly, and to permit others to do so.
#
# NEITHER THE UNITED STATES GOVERNMENT, NOR THE UNITED STATES DEPARTMENT
# OF ENERGY, NOR SANDIA CORPORATION, NOR ANY OF THEIR EMPLOYEES, MAKES
# ANY WARRANTY, EXPRESS OR IMPLIED, OR ASSUMES ANY LEGAL LIABILITY OR
# RESPONSIBILITY FOR THE ACCURACY, COMPLETENESS, OR USEFULNESS OF ANY
# INFORMATION, APPARATUS, PRODUCT, OR PROCESS DISCLOSED, OR REPRESENTS
# THAT ITS USE WOULD NOT INFRINGE PRIVATELY OWNED RIGHTS.
#
# ************************************************************************
# @HEADER
#-----------------------------------------------------------------------------
# Qthreads user-level lightweight threading library.
#
# Acquisition information:
# Date checked: July 2014
# Checked by: H. Carter Edwards <hcedwar AT sandia.gov>
# Source: https://code.google.com/p/qthreads
#
TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( QTHREAD
REQUIRED_HEADERS qthread.h
REQUIRED_LIBS_NAMES "qthread"
)

View File

@ -0,0 +1,485 @@
INCLUDE(CMakeParseArguments)
INCLUDE(CTest)
FUNCTION(ASSERT_DEFINED VARS)
FOREACH(VAR ${VARS})
IF(NOT DEFINED ${VAR})
MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!")
ENDIF()
ENDFOREACH()
ENDFUNCTION()
MACRO(GLOBAL_SET VARNAME)
SET(${VARNAME} ${ARGN} CACHE INTERNAL "")
ENDMACRO()
MACRO(PREPEND_GLOBAL_SET VARNAME)
ASSERT_DEFINED(${VARNAME})
GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}})
ENDMACRO()
FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME)
ASSERT_DEFINED(${VARNAME})
IF (${VARNAME})
SET(TMP ${${VARNAME}})
LIST(REMOVE_DUPLICATES TMP)
GLOBAL_SET(${VARNAME} ${TMP})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE)
MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'")
SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" )
IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "")
IF(${USER_OPTION_NAME})
GLOBAL_SET(${MACRO_DEFINE_NAME} ON)
ELSE()
GLOBAL_SET(${MACRO_DEFINE_NAME} OFF)
ENDIF()
ENDIF()
ENDMACRO()
FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE)
# Configure the file
CONFIGURE_FILE(
${PACKAGE_SOURCE_DIR}/cmake/${PACKAGE_NAME_CONFIG_FILE}.in
${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME_CONFIG_FILE}
)
ENDFUNCTION()
MACRO(TRIBITS_ADD_DEBUG_OPTION)
TRIBITS_ADD_OPTION_AND_DEFINE(
${PROJECT_NAME}_ENABLE_DEBUG
HAVE_${PROJECT_NAME_UC}_DEBUG
"Enable a host of runtime debug checking."
OFF
)
ENDMACRO()
MACRO(TRIBITS_ADD_TEST_DIRECTORIES)
FOREACH(TEST_DIR ${ARGN})
ADD_SUBDIRECTORY(${TEST_DIR})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ADD_EXAMPLE_DIRECTORIES)
IF(${PACKAGE_NAME}_ENABLE_EXAMPLES OR ${PARENT_PACKAGE_NAME}_ENABLE_EXAMPLES)
FOREACH(EXAMPLE_DIR ${ARGN})
ADD_SUBDIRECTORY(${EXAMPLE_DIR})
ENDFOREACH()
ENDIF()
ENDMACRO()
MACRO(TARGET_TRANSFER_PROPERTY TARGET_NAME PROP_IN PROP_OUT)
SET(PROP_VALUES)
FOREACH(TARGET_X ${ARGN})
LIST(APPEND PROP_VALUES "$<TARGET_PROPERTY:${TARGET_X},${PROP_IN}>")
ENDFOREACH()
SET_TARGET_PROPERTIES(${TARGET_NAME} PROPERTIES ${PROP_OUT} "${PROP_VALUES}")
ENDMACRO()
MACRO(ADD_INTERFACE_LIBRARY LIB_NAME)
FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "")
ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp)
SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE)
ENDMACRO()
# Older versions of CMake do not make include directories transitive
MACRO(TARGET_LINK_AND_INCLUDE_LIBRARIES TARGET_NAME)
TARGET_LINK_LIBRARIES(${TARGET_NAME} LINK_PUBLIC ${ARGN})
FOREACH(DEP_LIB ${ARGN})
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INTERFACE_INCLUDE_DIRECTORIES>)
TARGET_INCLUDE_DIRECTORIES(${TARGET_NAME} PUBLIC $<TARGET_PROPERTY:${DEP_LIB},INCLUDE_DIRECTORIES>)
ENDFOREACH()
ENDMACRO()
FUNCTION(TRIBITS_ADD_LIBRARY LIBRARY_NAME)
SET(options STATIC SHARED TESTONLY NO_INSTALL_LIB_OR_HEADERS CUDALIBRARY)
SET(oneValueArgs)
SET(multiValueArgs HEADERS HEADERS_INSTALL_SUBDIR NOINSTALLHEADERS SOURCES DEPLIBS IMPORTEDLIBS DEFINES ADDED_LIB_TARGET_NAME_OUT)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
IF(PARSE_HEADERS)
LIST(REMOVE_DUPLICATES PARSE_HEADERS)
ENDIF()
IF(PARSE_SOURCES)
LIST(REMOVE_DUPLICATES PARSE_SOURCES)
ENDIF()
# Local variable to hold all of the libraries that will be directly linked
# to this library.
SET(LINK_LIBS ${${PACKAGE_NAME}_DEPS})
# Add dependent libraries passed directly in
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
IF (PARSE_DEPLIBS)
LIST(APPEND LINK_LIBS ${PARSE_DEPLIBS})
ENDIF()
# Add the library and all the dependencies
IF (PARSE_DEFINES)
ADD_DEFINITIONS(${PARSE_DEFINES})
ENDIF()
IF (PARSE_STATIC)
SET(STATIC_KEYWORD "STATIC")
ELSE()
SET(STATIC_KEYWORD)
ENDIF()
IF (PARSE_SHARED)
SET(SHARED_KEYWORD "SHARED")
ELSE()
SET(SHARED_KEYWORD)
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
IF (NOT PARSE_CUDALIBRARY)
ADD_LIBRARY(
${LIBRARY_NAME}
${STATIC_KEYWORD}
${SHARED_KEYWORD}
${EXCLUDE_FROM_ALL_KEYWORD}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ELSE()
CUDA_ADD_LIBRARY(
${LIBRARY_NAME}
${PARSE_HEADERS}
${PARSE_NOINSTALLHEADERS}
${PARSE_SOURCES}
)
ENDIF()
TARGET_LINK_AND_INCLUDE_LIBRARIES(${LIBRARY_NAME} ${LINK_LIBS})
IF (NOT PARSE_TESTONLY OR PARSE_NO_INSTALL_LIB_OR_HEADERS)
INSTALL(
TARGETS ${LIBRARY_NAME}
EXPORT ${PROJECT_NAME}
RUNTIME DESTINATION bin
LIBRARY DESTINATION lib
ARCHIVE DESTINATION lib
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
FILES ${PARSE_HEADERS}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
INSTALL(
DIRECTORY ${PARSE_HEADERS_INSTALL_SUBDIR}
EXPORT ${PROJECT_NAME}
DESTINATION include
COMPONENT ${PACKAGE_NAME}
)
ENDIF()
IF (NOT PARSE_TESTONLY)
PREPEND_GLOBAL_SET(${PACKAGE_NAME}_LIBS ${LIBRARY_NAME})
REMOVE_GLOBAL_DUPLICATES(${PACKAGE_NAME}_LIBS)
ENDIF()
ENDFUNCTION()
FUNCTION(TRIBITS_ADD_EXECUTABLE EXE_NAME)
SET(options NOEXEPREFIX NOEXESUFFIX ADD_DIR_TO_NAME INSTALLABLE TESTONLY)
SET(oneValueArgs ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs SOURCES CATEGORIES HOST XHOST HOSTTYPE XHOSTTYPE DIRECTORY TESTONLYLIBS IMPORTEDLIBS DEPLIBS COMM LINKER_LANGUAGE TARGET_DEFINES DEFINES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
SET(LINK_LIBS PACKAGE_${PACKAGE_NAME})
IF (PARSE_TESTONLYLIBS)
LIST(APPEND LINK_LIBS ${PARSE_TESTONLYLIBS})
ENDIF()
IF (PARSE_IMPORTEDLIBS)
LIST(APPEND LINK_LIBS ${PARSE_IMPORTEDLIBS})
ENDIF()
SET (EXE_SOURCES)
IF(PARSE_DIRECTORY)
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
IF(IS_ABSOLUTE ${SOURCE_FILE})
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ELSE()
SET (EXE_SOURCES ${EXE_SOURCES} ${PARSE_DIRECTORY}/${SOURCE_FILE})
ENDIF()
ENDFOREACH( )
ELSE()
FOREACH( SOURCE_FILE ${PARSE_SOURCES} )
SET (EXE_SOURCES ${EXE_SOURCES} ${SOURCE_FILE})
ENDFOREACH( )
ENDIF()
SET(EXE_BINARY_NAME ${EXE_NAME})
IF(DEFINED PACKAGE_NAME AND NOT PARSE_NOEXEPREFIX)
SET(EXE_BINARY_NAME ${PACKAGE_NAME}_${EXE_BINARY_NAME})
ENDIF()
IF (PARSE_TESTONLY)
SET(EXCLUDE_FROM_ALL_KEYWORD "EXCLUDE_FROM_ALL")
ELSE()
SET(EXCLUDE_FROM_ALL_KEYWORD)
ENDIF()
ADD_EXECUTABLE(${EXE_BINARY_NAME} ${EXCLUDE_FROM_ALL_KEYWORD} ${EXE_SOURCES})
IF (PARSE_TARGET_DEFINES)
TARGET_COMPILE_DEFINITIONS(${EXE_BINARY_NAME} PUBLIC ${PARSE_TARGET_DEFINES})
ENDIF()
TARGET_LINK_AND_INCLUDE_LIBRARIES(${EXE_BINARY_NAME} ${LINK_LIBS})
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${EXE_BINARY_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_INSTALLABLE)
INSTALL(
TARGETS ${EXE_BINARY_NAME}
EXPORT ${PROJECT_NAME}
DESTINATION bin
)
ENDIF()
ENDFUNCTION()
ADD_CUSTOM_TARGET(check COMMAND ${CMAKE_CTEST_COMMAND} -VV -C ${CMAKE_CFG_INTDIR})
FUNCTION(TRIBITS_ADD_EXECUTABLE_AND_TEST EXE_NAME)
SET(options STANDARD_PASS_OUTPUT WILL_FAIL)
SET(oneValueArgs PASS_REGULAR_EXPRESSION FAIL_REGULAR_EXPRESSION ENVIRONMENT TIMEOUT CATEGORIES ADDED_TESTS_NAMES_OUT ADDED_EXE_TARGET_NAME_OUT)
SET(multiValueArgs)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
TRIBITS_ADD_EXECUTABLE(${EXE_NAME} TESTONLY ADDED_EXE_TARGET_NAME_OUT TEST_NAME ${PARSE_UNPARSED_ARGUMENTS})
IF(WIN32)
ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${TEST_NAME}${CMAKE_EXECUTABLE_SUFFIX})
ELSE()
ADD_TEST(NAME ${TEST_NAME} COMMAND ${TEST_NAME})
ENDIF()
ADD_DEPENDENCIES(check ${TEST_NAME})
IF(PARSE_FAIL_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${PARSE_FAIL_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_PASS_REGULAR_EXPRESSION)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${PARSE_PASS_REGULAR_EXPRESSION})
ENDIF()
IF(PARSE_WILL_FAIL)
SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${PARSE_WILL_FAIL})
ENDIF()
IF(PARSE_ADDED_TESTS_NAMES_OUT)
SET(${PARSE_ADDED_TESTS_NAMES_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
IF(PARSE_ADDED_EXE_TARGET_NAME_OUT)
SET(${PARSE_ADDED_EXE_TARGET_NAME_OUT} ${TEST_NAME} PARENT_SCOPE)
ENDIF()
ENDFUNCTION()
MACRO(TIBITS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME)
ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME})
TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES})
TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS})
ENDMACRO()
FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME)
SET(options MUST_FIND_ALL_LIBS MUST_FIND_ALL_HEADERS NO_PRINT_ENABLE_SUCCESS_FAIL)
SET(oneValueArgs)
SET(multiValueArgs REQUIRED_HEADERS REQUIRED_LIBS_NAMES)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE)
IF (PARSE_REQUIRED_LIBS_NAMES)
FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES})
IF(NOT TPL_${TPL_NAME}_LIBRARIES)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (PARSE_REQUIRED_HEADERS)
FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS})
IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS)
SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE)
ENDIF()
ENDIF()
IF (_${TPL_NAME}_ENABLE_SUCCESS)
TIBITS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME})
ENDIF()
ENDFUNCTION()
MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE)
GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE)
INCLUDE("${TPL_FILE}")
IF(TARGET TPL_LIB_${TPL_NAME})
MESSAGE(STATUS "Found tpl library: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} TRUE)
ELSE()
MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}")
SET(TPL_ENABLE_${TPL_NAME} FALSE)
ENDIF()
ENDMACRO()
MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE)
IF(TYPE STREQUAL "REQUIRED")
SET(REQUIRED TRUE)
ELSE()
SET(REQUIRED FALSE)
ENDIF()
IF(TARGET ${TARGET_NAME})
PREPEND_GLOBAL_SET(${VARNAME} ${TARGET_NAME})
ELSE()
IF(REQUIRED)
MESSAGE(FATAL_ERROR "Missing dependency ${TARGET_NAME}")
ENDIF()
ENDIF()
ENDMACRO()
MACRO(TRIBITS_APPEND_PACKAGE_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_GLOBAL_SET(${DEP_LIST} PACKAGE_${DEP})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_APPEND_TPLS_DEPS DEP_LIST TYPE)
FOREACH(DEP ${ARGN})
PREPEND_TARGET_SET(${DEP_LIST} TPL_LIB_${DEP} ${TYPE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_ENABLE_TPLS)
FOREACH(TPL ${ARGN})
IF(TARGET ${TPL})
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} TRUE)
ELSE()
GLOBAL_SET(${PACKAGE_NAME}_ENABLE_${TPL} FALSE)
ENDIF()
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PACKAGE_DEFINE_DEPENDENCIES)
SET(options)
SET(oneValueArgs)
SET(multiValueArgs
LIB_REQUIRED_PACKAGES
LIB_OPTIONAL_PACKAGES
TEST_REQUIRED_PACKAGES
TEST_OPTIONAL_PACKAGES
LIB_REQUIRED_TPLS
LIB_OPTIONAL_TPLS
TEST_REQUIRED_TPLS
TEST_OPTIONAL_TPLS
REGRESSION_EMAIL_LIST
SUBPACKAGES_DIRS_CLASSIFICATIONS_OPTREQS
)
CMAKE_PARSE_ARGUMENTS(PARSE "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
GLOBAL_SET(${PACKAGE_NAME}_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS REQUIRED ${PARSE_LIB_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_DEPS OPTIONAL ${PARSE_LIB_OPTIONAL_TPLS})
GLOBAL_SET(${PACKAGE_NAME}_TEST_DEPS "")
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_PACKAGES})
TRIBITS_APPEND_PACKAGE_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_PACKAGES})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS REQUIRED ${PARSE_TEST_REQUIRED_TPLS})
TRIBITS_APPEND_TPLS_DEPS(${PACKAGE_NAME}_TEST_DEPS OPTIONAL ${PARSE_TEST_OPTIONAL_TPLS})
TRIBITS_ENABLE_TPLS(${PARSE_LIB_REQUIRED_TPLS} ${PARSE_LIB_OPTIONAL_TPLS} ${PARSE_TEST_REQUIRED_TPLS} ${PARSE_TEST_OPTIONAL_TPLS})
ENDMACRO()
MACRO(TRIBITS_SUBPACKAGE NAME)
SET(PACKAGE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME})
SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME})
GLOBAL_SET(${PACKAGE_NAME}_LIBS "")
INCLUDE(${PACKAGE_SOURCE_DIR}/cmake/Dependencies.cmake)
ENDMACRO(TRIBITS_SUBPACKAGE)
MACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
TARGET_LINK_AND_INCLUDE_LIBRARIES(PACKAGE_${PACKAGE_NAME} ${${PACKAGE_NAME}_LIBS})
ENDMACRO(TRIBITS_SUBPACKAGE_POSTPROCESS)
MACRO(TRIBITS_PACKAGE_DECL NAME)
PROJECT(${NAME})
STRING(TOUPPER ${PROJECT_NAME} PROJECT_NAME_UC)
SET(PACKAGE_NAME ${PROJECT_NAME})
STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC)
SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps")
FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake")
FOREACH(TPL_FILE ${TPLS_FILES})
TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE})
ENDFOREACH()
ENDMACRO()
MACRO(TRIBITS_PROCESS_SUBPACKAGES)
FILE(GLOB SUBPACKAGES RELATIVE ${CMAKE_SOURCE_DIR} */cmake/Dependencies.cmake)
FOREACH(SUBPACKAGE ${SUBPACKAGES})
GET_FILENAME_COMPONENT(SUBPACKAGE_CMAKE ${SUBPACKAGE} DIRECTORY)
GET_FILENAME_COMPONENT(SUBPACKAGE_DIR ${SUBPACKAGE_CMAKE} DIRECTORY)
ADD_SUBDIRECTORY(${SUBPACKAGE_DIR})
ENDFOREACH()
ENDMACRO(TRIBITS_PROCESS_SUBPACKAGES)
MACRO(TRIBITS_PACKAGE_DEF)
ENDMACRO(TRIBITS_PACKAGE_DEF)
MACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
ENDMACRO(TRIBITS_EXCLUDE_AUTOTOOLS_FILES)
MACRO(TRIBITS_EXCLUDE_FILES)
ENDMACRO(TRIBITS_EXCLUDE_FILES)
MACRO(TRIBITS_PACKAGE_POSTPROCESS)
ENDMACRO(TRIBITS_PACKAGE_POSTPROCESS)

View File

@ -0,0 +1,153 @@
// -------------------------------------------------------------------------------- //
The following steps are for workstations/servers with the SEMS environment installed.
// -------------------------------------------------------------------------------- //
Summary:
- Step 1: Rigorous testing of Kokkos' develop branch for each backend (Serial, OpenMP, Threads, Cuda) with all supported compilers.
- Step 2: Snapshot Kokkos' develop branch into current Trilinos develop branch.
- Step 3: Build and test Trilinos with combinations of compilers, types, backends.
- Step 4: Promote Kokkos develop branch to master if the snapshot does not cause any new tests to fail; else track/fix causes of new failures.
- Step 5: Snapshot Kokkos tagged master branch into Trilinos and push Trilinos.
// -------------------------------------------------------------------------------- //
// -------------------------------------------------------------------------------- //
Step 1:
1.1. Update kokkos develop branch (NOT a fork)
(From kokkos directory):
git fetch --all
git checkout develop
git reset --hard origin/develop
1.2. Create a testing directory - here the directory is created within the kokkos directory
mkdir testing
cd testing
1.3. Run the test_all_sandia script; various compiler and build-list options can be specified
../config/test_all_sandia
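For example, to test just two of the supported compilers, build without running, and use eight parallel jobs (a hypothetical invocation; the available flags are listed in the script's usage message):
../config/test_all_sandia gcc/4.9.2 intel/16.0.1 --build-only --num=8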
1.4. Clean the repository of untracked files
cd ../
git clean -df
// -------------------------------------------------------------------------------- //
Step 2:
2.1 Update Trilinos develop branch
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
2.2 Snapshot Kokkos into Trilinos - this requires python/2.7.9 and that both Trilinos and Kokkos be clean - no untracked or modified files
module load python/2.7.9
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
// -------------------------------------------------------------------------------- //
Step 3:
3.1. Build and test Trilinos with three different configurations; a configure-all script is provided in Trilinos and should be modified to test each of the following configurations with the appropriate environment variable(s):
- GCC/4.7.2-OpenMP/Complex
Run tests with the following environment variable:
export OMP_NUM_THREADS=2
- Intel/15.0.2-Serial/NoComplex
- GCC/4.8.4/CUDA/7.5.18-Cuda/Serial/NoComplex
Run tests with the following environment variables:
export CUDA_LAUNCH_BLOCKING=1
export CUDA_MANAGED_FORCE_DEVICE_ALLOC=1
mkdir Build
cd Build
cp TRILINOS_PATH/sampleScripts/Sandia-SEMS/configure-all ./
** Set the path to Trilinos appropriately within the configure-all script **
source $SEMS_MODULE_ROOT/utils/sems-modules-init.sh kokkos
source configure-all
make -k (-k means "keep going" to get past build errors; -j12 can also be specified to build with 12 threads, for example)
ctest
3.2. Compare the failed test output to the test output on the dashboard (testing.sandia.gov/cdash, select Trilinos); investigate and fix problems if new tests fail after the Kokkos snapshot
// -------------------------------------------------------------------------------- //
Step 4:
4.1. Once all Trilinos tests pass, promote the Kokkos develop branch to master on GitHub
- DO NOT fast-forward the merge!!!!
(From kokkos directory):
git checkout master
git fetch --all
# Ensure we are on the current origin/master
git reset --hard origin/master
git merge --no-ff origin/develop
4.2. Update the tag in kokkos/config/master_history.txt
Tag description: MajorNumber.MinorNumber.WeeksSinceMinorNumberUpdate
Tag format: #.#.##
# Prepend master_history.txt with
# tag: #.#.##
# date: mm/dd/yyyy
# master: sha1
# develop: sha1
# -----------------------
git commit --amend -a
git tag -a #.#.##
tag: #.#.##
date: mm/dd/yyyy
master: sha1
develop: sha1
git push --follow-tags origin master
// -------------------------------------------------------------------------------- //
Step 5:
5.1. Make sure Trilinos is up-to-date - chances are other changes have been committed since the integration testing process began. If a substantial change has occurred that may be affected by the snapshot, the testing procedure may need to be repeated
(From Trilinos directory):
git checkout develop
git fetch --all
git reset --hard origin/develop
git clean -df
5.2. Snapshot Kokkos master branch into Trilinos
(From kokkos directory):
git fetch --all
git checkout tags/#.#.##
git clean -df
python KOKKOS_PATH/config/snapshot.py KOKKOS_PATH TRILINOS_PATH/packages
5.3. Push the updated develop branch of Trilinos to GitHub - congratulations!!!
(From Trilinos directory):
git push
// -------------------------------------------------------------------------------- //

View File

@ -0,0 +1,3 @@
tag: 2.01.00 date: 07:21:2016 master: xxxxxxxx develop: fa6dfcc4
tag: 2.01.06 date: 09:02:2016 master: 9afaa87f develop: 555f1a3a

View File

@ -1,17 +1,12 @@
#!/bin/bash #!/bin/bash
# #
# This shell script (nvcc_wrapper) wraps both the host compiler and # This shell script (nvcc_wrapper) wraps both the host compiler and
# NVCC, if you are building Trilinos with CUDA enabled. The script # NVCC, if you are building legacy C or C++ code with CUDA enabled.
# remedies some differences between the interface of NVCC and that of # The script remedies some differences between the interface of NVCC
# the host compiler, in particular for linking. It also means that # and that of the host compiler, in particular for linking.
# Trilinos doesn't need separate .cu files; it can just use .cpp # It also means that a legacy code doesn't need separate .cu files;
# files. # it can just use .cpp files.
# #
# Hopefully, at some point, NVIDIA may fix NVCC so as to make this
# script obsolete. For now, this script exists and if you want to
# build Trilinos with CUDA enabled, you must use this script as your
# compiler.
# Default settings: change those according to your machine. For # Default settings: change those according to your machine. For
# example, you may have two different wrappers with either icpc # example, you may have two different wrappers with either icpc
# or g++ as their back-end compiler. The defaults can be overwritten # or g++ as their back-end compiler. The defaults can be overwritten
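# A minimal usage sketch (hypothetical file names; assumes nvcc and a host
# compiler are on the PATH; -O and -arch flags are forwarded appropriately
# to nvcc and the host compiler):
#   nvcc_wrapper -O3 -arch=sm_35 -c example.cpp -o example.o
#   nvcc_wrapper -O3 example.o -o example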
@ -53,6 +48,10 @@ object_files=""
# Link objects for the host linker only # Link objects for the host linker only
object_files_xlinker="" object_files_xlinker=""
# Shared libraries with version numbers are not handled correctly by NVCC
shared_versioned_libraries_host=""
shared_versioned_libraries=""
# Does the User set the architecture # Does the User set the architecture
arch_set=0 arch_set=0
@ -76,6 +75,9 @@ first_xcompiler_arg=1
temp_dir=${TMPDIR:-/tmp} temp_dir=${TMPDIR:-/tmp}
# Check if we have an optimization argument already
optimization_applied=0
#echo "Arguments: $# $@" #echo "Arguments: $# $@"
while [ $# -gt 0 ] while [ $# -gt 0 ]
@ -97,8 +99,17 @@ do
*.cpp|*.cxx|*.cc|*.C|*.c++|*.cu) *.cpp|*.cxx|*.cc|*.C|*.c++|*.cu)
cpp_files="$cpp_files $1" cpp_files="$cpp_files $1"
;; ;;
# Ensure we only have one optimization flag because NVCC doesn't allow multiple
-O*)
if [ $optimization_applied -eq 1 ]; then
echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-O*), only the first is used because nvcc can only accept a single optimization setting."
else
shared_args="$shared_args $1"
optimization_applied=1
fi
;;
#Handle shared args (valid for both nvcc and the host compiler) #Handle shared args (valid for both nvcc and the host compiler)
-O*|-D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared) -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared)
shared_args="$shared_args $1" shared_args="$shared_args $1"
;; ;;
#Handle shared args that have an argument #Handle shared args that have an argument
@ -107,7 +118,7 @@ do
shift shift
;; ;;
#Handle known nvcc args #Handle known nvcc args
-gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage) -gencode*|--dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|--resource-usage|-Xptxas*)
cuda_args="$cuda_args $1" cuda_args="$cuda_args $1"
;; ;;
#Handle known nvcc args that have an argument #Handle known nvcc args that have an argument
@ -175,10 +186,15 @@ do
object_files_xlinker="$object_files_xlinker -Xlinker $1" object_files_xlinker="$object_files_xlinker -Xlinker $1"
;; ;;
#Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking
*.so.*|*.dylib) *.dylib)
object_files="$object_files -Xlinker $1" object_files="$object_files -Xlinker $1"
object_files_xlinker="$object_files_xlinker -Xlinker $1" object_files_xlinker="$object_files_xlinker -Xlinker $1"
;; ;;
#Handle shared libraries with *.so.* names which nvcc can't do.
*.so.*)
shared_versioned_libraries_host="$shared_versioned_libraries_host $1"
shared_versioned_libraries="$shared_versioned_libraries -Xlinker $1"
;;
#All other args are sent to the host compiler #All other args are sent to the host compiler
*) *)
if [ $first_xcompiler_arg -eq 1 ]; then if [ $first_xcompiler_arg -eq 1 ]; then
@ -204,13 +220,13 @@ if [ $arch_set -ne 1 ]; then
fi fi
#Compose compilation command #Compose compilation command
nvcc_command="nvcc $cuda_args $shared_args $xlinker_args" nvcc_command="nvcc $cuda_args $shared_args $xlinker_args $shared_versioned_libraries"
if [ $first_xcompiler_arg -eq 0 ]; then if [ $first_xcompiler_arg -eq 0 ]; then
nvcc_command="$nvcc_command -Xcompiler $xcompiler_args" nvcc_command="$nvcc_command -Xcompiler $xcompiler_args"
fi fi
#Compose host only command #Compose host only command
host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args" host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host"
#nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING' #nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING'
if [ $replace_pragma_ident -eq 1 ]; then if [ $replace_pragma_ident -eq 1 ]; then

View File

@ -6,34 +6,36 @@
set -o pipefail set -o pipefail
# Determine current machine
MACHINE=""
HOSTNAME=$(hostname)
if [[ "$HOSTNAME" =~ (white|ride).* ]]; then
MACHINE=white
elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then
MACHINE=bowman
elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name
MACHINE=shepard
elif [ ! -z "$SEMS_MODULEFILES_ROOT" ]; then
MACHINE=sems
else
echo "Unrecognized machine" >&2
exit 1
fi
GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" GCC_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
IBM_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial" INTEL_BUILD_LIST="OpenMP,Pthread,Serial,OpenMP_Serial,Pthread_Serial"
CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial" CLANG_BUILD_LIST="Pthread,Serial,Pthread_Serial"
CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial" CUDA_BUILD_LIST="Cuda_OpenMP,Cuda_Pthread,Cuda_Serial"
GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized" GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wignored-qualifiers,-Wempty-body,-Wclobbered,-Wuninitialized"
IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized"
CUDA_WARNING_FLAGS="" CUDA_WARNING_FLAGS=""
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base" # Defaults; machine-specific config below can override
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
DEBUG=False DEBUG=False
ARGS="" ARGS=""
CUSTOM_BUILD_LIST="" CUSTOM_BUILD_LIST=""
@ -41,6 +43,107 @@ DRYRUN=False
BUILD_ONLY=False BUILD_ONLY=False
declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3 declare -i NUM_JOBS_TO_RUN_IN_PARALLEL=3
TEST_SCRIPT=False TEST_SCRIPT=False
SKIP_HWLOC=False
ARCH_FLAG=""
#
# Machine specific config
#
if [ "$MACHINE" = "sems" ]; then
source /projects/modulefiles/utils/sems-modules-init.sh
source /projects/modulefiles/utils/kokkos-modules-init.sh
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>/base,hwloc/1.10.1/<COMPILER_NAME>/<COMPILER_VERSION>/base"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.7.2/base"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
elif [ "$MACHINE" = "white" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>"
IBM_MODULE_LIST="<COMPILER_NAME>/xl/<COMPILER_VERSION>"
CUDA_MODULE_LIST="<COMPILER_NAME>/<COMPILER_VERSION>,gcc/4.9.2"
# Don't do pthread on white
GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.9.2 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.3.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS"
)
ARCH_FLAG="--arch=Power8"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "bowman" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=KNL"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
elif [ "$MACHINE" = "shepard" ]; then
source /etc/profile.d/modules.sh
SKIP_HWLOC=True
export SLURM_TASKS_PER_NODE=32
BASE_MODULE_LIST="<COMPILER_NAME>/compilers/<COMPILER_VERSION>"
OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial"
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/17.0.064 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
)
ARCH_FLAG="--arch=HSW"
NUM_JOBS_TO_RUN_IN_PARALLEL=8
else
echo "Unhandled machine $MACHINE" >&2
exit 1
fi
export OMP_NUM_THREADS=4
declare -i NUM_RESULTS_TO_KEEP=7
RESULT_ROOT_PREFIX=TestAll
SCRIPT_KOKKOS_ROOT=$( cd "$( dirname "$0" )" && cd .. && pwd )
#
# Handle arguments
#
while [[ $# > 0 ]] while [[ $# > 0 ]]
do do
@ -61,6 +164,9 @@ BUILD_ONLY=True
--test-script*) --test-script*)
TEST_SCRIPT=True TEST_SCRIPT=True
;; ;;
--skip-hwloc*)
SKIP_HWLOC=True
;;
--num*) --num*)
NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}" NUM_JOBS_TO_RUN_IN_PARALLEL="${key#*=}"
;; ;;
@ -73,6 +179,7 @@ echo "--kokkos-path=/Path/To/Kokkos: Path to the Kokkos root directory"
echo " Defaults to root repo containing this script" echo " Defaults to root repo containing this script"
echo "--debug: Run tests in debug. Defaults to False" echo "--debug: Run tests in debug. Defaults to False"
echo "--test-script: Test this script, not Kokkos" echo "--test-script: Test this script, not Kokkos"
echo "--skip-hwloc: Do not do hwloc tests"
echo "--num=N: Number of jobs to run in parallel " echo "--num=N: Number of jobs to run in parallel "
echo "--dry-run: Just print what would be executed" echo "--dry-run: Just print what would be executed"
echo "--build-only: Just do builds, don't run anything" echo "--build-only: Just do builds, don't run anything"
@ -82,21 +189,16 @@ echo " Valid items:"
echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial" echo " OpenMP, Pthread, Serial, OpenMP_Serial, Pthread_Serial"
echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial" echo " Cuda_OpenMP, Cuda_Pthread, Cuda_Serial"
echo "" echo ""
echo "ARGS: list of expressions matching compilers to test" echo "ARGS: list of expressions matching compilers to test"
echo " supported compilers" echo " supported compilers sems"
echo " gcc/4.7.2" for COMPILER_DATA in "${COMPILERS[@]}"; do
echo " gcc/4.8.4" ARR=($COMPILER_DATA)
echo " gcc/4.9.2" COMPILER=${ARR[0]}
echo " gcc/5.1.0" echo " $COMPILER"
echo " intel/14.0.4" done
echo " intel/15.0.2"
echo " intel/16.0.1"
echo " clang/3.5.2"
echo " clang/3.6.1"
echo " cuda/6.5.14"
echo " cuda/7.0.28"
echo " cuda/7.5.18"
echo "" echo ""
echo "Examples:" echo "Examples:"
echo " Run all tests" echo " Run all tests"
echo " % test_all_sandia" echo " % test_all_sandia"
@ -147,21 +249,6 @@ if [ -z "$ARGS" ]; then
ARGS='?' ARGS='?'
fi fi
# Format: (compiler module-list build-list exe-name warning-flag)
COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"gcc/5.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS"
"intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS"
"clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS"
"cuda/6.5.14 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
"cuda/7.5.18 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/config/nvcc_wrapper $CUDA_WARNING_FLAGS"
)
# Process args to figure out which compilers to test # Process args to figure out which compilers to test
COMPILERS_TO_TEST="" COMPILERS_TO_TEST=""
for ARG in $ARGS; do for ARG in $ARGS; do
@ -240,18 +327,19 @@ run_cmd() {
fi fi
} }
# report_and_log_test_result <SUCCESS> <DESC> <PHASE> # report_and_log_test_result <SUCCESS> <DESC> <COMMENT>
report_and_log_test_result() { report_and_log_test_result() {
# Use sane var names # Use sane var names
local success=$1; local desc=$2; local phase=$3; local success=$1; local desc=$2; local comment=$3;
if [ "$success" = "0" ]; then if [ "$success" = "0" ]; then
echo " PASSED $desc" echo " PASSED $desc"
touch $PASSED_DIR/$desc echo $comment > $PASSED_DIR/$desc
else else
# For failures, comment should be the name of the phase that failed
echo " FAILED $desc" >&2 echo " FAILED $desc" >&2
echo $phase > $FAILED_DIR/$desc echo $comment > $FAILED_DIR/$desc
cat ${desc}.${phase}.log cat ${desc}.${comment}.log
fi fi
} }
@ -309,6 +397,8 @@ single_build_and_test() {
echo " Starting job $desc" echo " Starting job $desc"
local comment="no_comment"
if [ "$TEST_SCRIPT" = "True" ]; then if [ "$TEST_SCRIPT" = "True" ]; then
local rand=$[ 1 + $[ RANDOM % 10 ]] local rand=$[ 1 + $[ RANDOM % 10 ]]
sleep $rand sleep $rand
@ -316,14 +406,19 @@ single_build_and_test() {
run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; }
fi fi
else else
run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; }
local -i build_start_time=$(date +%s)
run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } run_cmd make build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; }
local -i build_end_time=$(date +%s)
comment="build_time=$(($build_end_time-$build_start_time))"
if [[ "$BUILD_ONLY" == False ]]; then if [[ "$BUILD_ONLY" == False ]]; then
run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; } run_cmd make test >& ${desc}.test.log || { report_and_log_test_result 1 ${desc} test && return 0; }
local -i run_end_time=$(date +%s)
comment="$comment run_time=$(($run_end_time-$build_end_time))"
fi fi
fi fi
report_and_log_test_result 0 $desc report_and_log_test_result 0 $desc "$comment"
return 0 return 0
} }
@ -374,7 +469,7 @@ build_and_test_all() {
run_in_background $compiler $build $BUILD_TYPE run_in_background $compiler $build $BUILD_TYPE
# If not cuda, do a hwloc test too # If not cuda, do a hwloc test too
if [[ "$compiler" != cuda* ]]; then if [[ "$compiler" != cuda* && "$SKIP_HWLOC" == False ]]; then
run_in_background $compiler $build "hwloc-$BUILD_TYPE" run_in_background $compiler $build "hwloc-$BUILD_TYPE"
fi fi
done done
@ -401,7 +496,11 @@ wait_summarize_and_exit() {
echo "PASSED TESTS" echo "PASSED TESTS"
echo "#######################################################" echo "#######################################################"
\ls -1 $PASSED_DIR | sort local passed_test
for passed_test in $(\ls -1 $PASSED_DIR | sort)
do
echo $passed_test $(cat $PASSED_DIR/$passed_test)
done
echo "#######################################################" echo "#######################################################"
echo "FAILED TESTS" echo "FAILED TESTS"
@ -409,7 +508,7 @@ wait_summarize_and_exit() {
local failed_test local failed_test
local -i rv=0 local -i rv=0
for failed_test in $(\ls -1 $FAILED_DIR) for failed_test in $(\ls -1 $FAILED_DIR | sort)
do do
echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)" echo $failed_test "("$(cat $FAILED_DIR/$failed_test)" failed)"
rv=$rv+1 rv=$rv+1

View File

@ -16,11 +16,22 @@ IF(Kokkos_ENABLE_OpenMP)
LIST( APPEND SOURCES TestOpenMP.cpp) LIST( APPEND SOURCES TestOpenMP.cpp)
ENDIF() ENDIF()
TRIBITS_ADD_EXECUTABLE_AND_TEST( # Per #374, we always want to build this test, but we only want to run
PerformanceTest # it as a PERFORMANCE test. That's why we separate building the test
# from running the test.
TRIBITS_ADD_EXECUTABLE(
PerfTestExec
SOURCES ${SOURCES} SOURCES ${SOURCES}
COMM serial mpi COMM serial mpi
NUM_MPI_PROCS 1
FAIL_REGULAR_EXPRESSION " FAILED "
TESTONLYLIBS kokkos_gtest TESTONLYLIBS kokkos_gtest
) )
TRIBITS_ADD_TEST(
PerformanceTest
NAME PerfTestExec
COMM serial mpi
NUM_MPI_PROCS 1
CATEGORIES PERFORMANCE
FAIL_REGULAR_EXPRESSION " FAILED "
)
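# To actually run the test, enable the PERFORMANCE category at configure
# time; a sketch, assuming a Trilinos build (the variable name follows the
# standard TriBITS <Project>_TEST_CATEGORIES convention):
#   cmake -D Trilinos_TEST_CATEGORIES=PERFORMANCE ${TRILINOS_PATH}
#   ctest -R PerfTest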

View File

@ -54,6 +54,8 @@
#if defined( KOKKOS_HAVE_CUDA ) #if defined( KOKKOS_HAVE_CUDA )
#include <TestDynRankView.hpp>
#include <Kokkos_UnorderedMap.hpp> #include <Kokkos_UnorderedMap.hpp>
#include <TestGlobal2LocalIds.hpp> #include <TestGlobal2LocalIds.hpp>
@ -77,6 +79,13 @@ protected:
} }
}; };
TEST_F( cuda, dynrankview_perf )
{
std::cout << "Cuda" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Cuda>( 4096 );
}
TEST_F( cuda, global_2_local) TEST_F( cuda, global_2_local)
{ {
std::cout << "Cuda" << std::endl; std::cout << "Cuda" << std::endl;

View File

@ -0,0 +1,265 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
#ifndef KOKKOS_TEST_DYNRANKVIEW_HPP
#define KOKKOS_TEST_DYNRANKVIEW_HPP
#include <Kokkos_Core.hpp>
#include <Kokkos_DynRankView.hpp>
#include <vector>
#include <impl/Kokkos_Timer.hpp>
// Compare performance of DynRankView to View, specific focus on the parenthesis operators
namespace Performance {
//View functor
template <typename DeviceType>
struct InitViewFunctor {
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
InitViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::View<double***, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::View<double*, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
struct InitStrideViewFunctor {
typedef Kokkos::View<double***, Kokkos::LayoutStride, DeviceType> inviewtype;
inviewtype _inview;
InitStrideViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
};
template <typename DeviceType>
struct InitViewRank7Functor {
typedef Kokkos::View<double*******, DeviceType> inviewtype;
inviewtype _inview;
InitViewRank7Functor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k,0,0,0,0) = i/2 -j*j + k/3;
}
}
}
};
//DynRankView functor
template <typename DeviceType>
struct InitDynRankViewFunctor {
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
InitDynRankViewFunctor( inviewtype &inview_ ) : _inview(inview_)
{}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_inview(i,j,k) = i/2 -j*j + k/3;
}
}
}
struct SumComputationTest
{
typedef Kokkos::DynRankView<double, DeviceType> inviewtype;
inviewtype _inview;
typedef Kokkos::DynRankView<double, DeviceType> outviewtype;
outviewtype _outview;
KOKKOS_INLINE_FUNCTION
SumComputationTest(inviewtype &inview_ , outviewtype &outview_) : _inview(inview_), _outview(outview_) {}
KOKKOS_INLINE_FUNCTION
void operator()(const int i) const {
for (unsigned j = 0; j < _inview.dimension(1); ++j) {
for (unsigned k = 0; k < _inview.dimension(2); ++k) {
_outview(i) += _inview(i,j,k) ;
}
}
}
};
};
template <typename DeviceType>
void test_dynrankview_op_perf( const int par_size )
{
typedef DeviceType execution_space;
typedef typename execution_space::size_type size_type;
const size_type dim2 = 900;
const size_type dim3 = 300;
double elapsed_time_view = 0;
double elapsed_time_compview = 0;
double elapsed_time_strideview = 0;
double elapsed_time_view_rank7 = 0;
double elapsed_time_drview = 0;
double elapsed_time_compdrview = 0;
Kokkos::Timer timer;
{
Kokkos::View<double***,DeviceType> testview("testview",par_size,dim2,dim3);
typedef InitViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view = timer.seconds();
std::cout << " View time (init only): " << elapsed_time_view << std::endl;
timer.reset();
Kokkos::View<double*,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testview, sumview) );
DeviceType::fence();
elapsed_time_compview = timer.seconds();
std::cout << " View sum computation time: " << elapsed_time_view << std::endl;
Kokkos::View<double***,Kokkos::LayoutStride, DeviceType> teststrideview = Kokkos::subview(testview, Kokkos::ALL, Kokkos::ALL,Kokkos::ALL);
typedef InitStrideViewFunctor<DeviceType> FunctorStrideType;
timer.reset();
Kokkos::parallel_for( policy , FunctorStrideType(teststrideview) );
DeviceType::fence();
elapsed_time_strideview = timer.seconds();
std::cout << " Strided View time (init only): " << elapsed_time_strideview << std::endl;
}
{
Kokkos::View<double*******,DeviceType> testview("testview",par_size,dim2,dim3,1,1,1,1);
typedef InitViewRank7Functor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testview) );
DeviceType::fence();
elapsed_time_view_rank7 = timer.seconds();
std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl;
}
{
Kokkos::DynRankView<double,DeviceType> testdrview("testdrview",par_size,dim2,dim3);
typedef InitDynRankViewFunctor<DeviceType> FunctorType;
timer.reset();
Kokkos::RangePolicy<DeviceType> policy(0,par_size);
Kokkos::parallel_for( policy , FunctorType(testdrview) );
DeviceType::fence();
elapsed_time_drview = timer.seconds();
std::cout << " DynRankView time (init only): " << elapsed_time_drview << std::endl;
timer.reset();
Kokkos::DynRankView<double,DeviceType> sumview("sumview",par_size);
Kokkos::parallel_for( policy , typename FunctorType::SumComputationTest(testdrview, sumview) );
DeviceType::fence();
elapsed_time_compdrview = timer.seconds();
std::cout << " DynRankView sum computation time: " << elapsed_time_compdrview << std::endl;
}
std::cout << " Ratio of View to DynRankView time: " << elapsed_time_view / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of View to DynRankView sum computation time: " << elapsed_time_compview / elapsed_time_compdrview << std::endl; //expect < 1
std::cout << " Ratio of View to View Rank7 time: " << elapsed_time_view / elapsed_time_view_rank7 << std::endl; //expect < 1
std::cout << " Ratio of StrideView to DynRankView time: " << elapsed_time_strideview / elapsed_time_drview << std::endl; //expect < 1
std::cout << " Ratio of DynRankView to View Rank7 time: " << elapsed_time_drview / elapsed_time_view_rank7 << std::endl; //expect ?
timer.reset();
} //end test_dynrankview
} //end Performance
#endif
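For orientation, a minimal standalone driver for this benchmark could look like the sketch below. It is not part of this commit; it assumes the header above is on the include path and that a host execution space is enabled.

// Hypothetical driver, not part of this commit.
#include <Kokkos_Core.hpp>
#include <TestDynRankView.hpp>

int main( int argc , char* argv[] )
{
  Kokkos::initialize( argc , argv );
  // Run the same comparison the unit tests run, on the default host space.
  Performance::test_dynrankview_op_perf<Kokkos::DefaultHostExecutionSpace>( 4096 );
  Kokkos::finalize();
  return 0;
}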


@@ -178,7 +178,7 @@ void test_global_to_local_ids(unsigned num_ids)
  std::cout << num_ids << ", ";

  double elasped_time = 0;
-  Kokkos::Impl::Timer timer;
+  Kokkos::Timer timer;

  local_id_view local_2_global("local_ids", num_ids);
  global_id_view global_2_local((3u*num_ids)/2u);


@@ -50,6 +50,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>

+#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@@ -91,6 +93,13 @@ protected:
  }
};
TEST_F( openmp, dynrankview_perf )
{
std::cout << "OpenMP" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::OpenMP>( 8192 );
}
TEST_F( openmp, global_2_local)
{
  std::cout << "OpenMP" << std::endl;


@@ -52,6 +52,8 @@
#include <TestGlobal2LocalIds.hpp>
#include <TestUnorderedMapPerformance.hpp>

+#include <TestDynRankView.hpp>
#include <iomanip>
#include <sstream>
#include <string>
@@ -85,6 +87,13 @@ protected:
  }
};
TEST_F( threads, dynrankview_perf )
{
std::cout << "Threads" << std::endl;
std::cout << " DynRankView vs View: Initialization Only " << std::endl;
test_dynrankview_op_perf<Kokkos::Threads>( 8192 );
}
TEST_F( threads, global_2_local)
{
  std::cout << "Threads" << std::endl;


@@ -80,7 +80,7 @@ struct UnorderedMapTest
  , map(capacity)
  , histogram(map.get_histogram())
{
-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;
  wall_clock.reset();

  value_type v = {};
@@ -228,7 +228,7 @@ void run_performance_tests(std::string const & base_file_name)
  distance_out << "\b\b\b " << std::endl;
  block_distance_out << "\b\b\b " << std::endl;

-  Kokkos::Impl::Timer wall_clock ;
+  Kokkos::Timer wall_clock ;

  for (int i=0; i < num_collisions ; ++i) {
    wall_clock.reset();
    std::cout << "Collisions: " << collisions[i] << std::endl;

File diff suppressed because it is too large.


@@ -77,10 +77,7 @@ private:
public:

-  typedef Kokkos::Experimental::MemoryPool
-    < typename traits::memory_space
-    , typename traits::execution_space
-    > memory_pool ;
+  typedef Kokkos::Experimental::MemoryPool< typename traits::device_type > memory_pool ;

private:

@@ -338,7 +335,7 @@
  void operator()( unsigned i ) const
    {
      if ( m_destroy && i < m_chunk_max && 0 != m_chunks[i] ) {
-        m_pool.deallocate( m_chunks[i] , m_pool.get_min_chunk_size() );
+        m_pool.deallocate( m_chunks[i] , m_pool.get_min_block_size() );
      }
      m_chunks[i] = 0 ;
    }
@@ -397,7 +394,7 @@
  // The memory pool chunk is guaranteed to be a power of two
  , m_chunk_shift(
      Kokkos::Impl::integral_power_of_two(
-        m_pool.get_min_chunk_size()/sizeof(typename traits::value_type)) )
+        m_pool.get_min_block_size()/sizeof(typename traits::value_type)) )
  , m_chunk_mask( ( 1 << m_chunk_shift ) - 1 )
  , m_chunk_max( ( arg_size_max + m_chunk_mask ) >> m_chunk_shift )
  {


@@ -45,6 +45,7 @@
#define KOKKOS_BITSET_IMPL_HPP

#include <Kokkos_Macros.hpp>
+#include <impl/Kokkos_BitOps.hpp>

#include <stdint.h>
#include <cstdio>
@@ -52,82 +53,17 @@
#include <iostream>
#include <iomanip>

-namespace Kokkos { namespace Impl {
+namespace Kokkos {
+namespace Impl {

KOKKOS_FORCEINLINE_FUNCTION
-unsigned rotate_right(unsigned i, int r)
+unsigned rotate_right( unsigned i, int r )
{
-  enum { size = static_cast<int>(sizeof(unsigned)*CHAR_BIT) };
-  return r ? ((i >> r) | (i << (size-r))) : i ;
+  enum { size = static_cast<int>( sizeof(unsigned) * CHAR_BIT ) };
+  return r ? ( ( i >> r ) | ( i << ( size - r ) ) ) : i ;
}

-KOKKOS_FORCEINLINE_FUNCTION
+template < typename Bitset >
int bit_scan_forward(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __ffs(i) - 1;
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_ffs(i) - 1;
#elif defined( __INTEL_COMPILER )
return _bit_scan_forward(i);
#else
unsigned t = 1u;
int r = 0;
while (i && (i & t == 0))
{
t = t << 1;
++r;
}
return r;
#endif
}
KOKKOS_FORCEINLINE_FUNCTION
int bit_scan_reverse(unsigned i)
{
enum { shift = static_cast<int>(sizeof(unsigned)*CHAR_BIT - 1) };
#if defined( __CUDA_ARCH__ )
return shift - __clz(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return shift - __builtin_clz(i);
#elif defined( __INTEL_COMPILER )
return _bit_scan_reverse(i);
#else
unsigned t = 1u << shift;
int r = 0;
while (i && (i & t == 0))
{
t = t >> 1;
++r;
}
return r;
#endif
}
// count the bits set
KOKKOS_FORCEINLINE_FUNCTION
int popcount(unsigned i)
{
#if defined( __CUDA_ARCH__ )
return __popc(i);
#elif defined( __GNUC__ ) || defined( __GNUG__ )
return __builtin_popcount(i);
#elif defined ( __INTEL_COMPILER )
return _popcnt32(i);
#else
// http://graphics.stanford.edu/~seander/bithacks.html#CountBitsSetNaive
i = i - ((i >> 1) & ~0u/3u); // temp
i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u); // temp
i = (i + (i >> 4)) & ~0u/255u*15u; // temp
return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT); // count
#endif
}
-template <typename Bitset>
struct BitsetCount
{
  typedef Bitset bitset_type;
@@ -137,37 +73,37 @@ struct BitsetCount

  bitset_type m_bitset;

-  BitsetCount( bitset_type const& bitset)
+  BitsetCount( bitset_type const& bitset )
    : m_bitset(bitset)
  {}

  size_type apply() const
  {
    size_type count = 0u;
-    parallel_reduce(m_bitset.m_blocks.dimension_0(), *this, count);
+    parallel_reduce( m_bitset.m_blocks.dimension_0(), *this, count );
    return count;
  }

  KOKKOS_INLINE_FUNCTION
-  static void init( value_type & count)
+  void init( value_type & count ) const
  {
    count = 0u;
  }

  KOKKOS_INLINE_FUNCTION
-  static void join( volatile value_type & count, const volatile size_type & incr )
+  void join( volatile value_type & count, const volatile size_type & incr ) const
  {
    count += incr;
  }

  KOKKOS_INLINE_FUNCTION
-  void operator()( size_type i, value_type & count) const
+  void operator()( size_type i, value_type & count ) const
  {
-    count += popcount(m_bitset.m_blocks[i]);
+    count += bit_count( m_bitset.m_blocks[i] );
  }
};

-}} //Kokkos::Impl
+} // namespace Impl
+} // namespace Kokkos

#endif // KOKKOS_BITSET_IMPL_HPP
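Since the bit-counting helpers above moved into impl/Kokkos_BitOps.hpp (the call site now uses bit_count instead of popcount), here is a self-contained sanity check of the portable SWAR counting arithmetic from the removed fallback branch; it is an illustration, not code from this commit.

#include <cassert>
#include <climits>

// Same arithmetic as the removed portable popcount branch above.
int popcount_fallback(unsigned i)
{
  i = i - ((i >> 1) & ~0u/3u);
  i = (i & ~0u/15u*3u) + ((i >> 2) & ~0u/15u*3u);
  i = (i + (i >> 4)) & ~0u/255u*15u;
  return (int)((i * (~0u/255u)) >> (sizeof(unsigned) - 1) * CHAR_BIT);
}

int main()
{
  assert( popcount_fallback(0u) == 0 );
  assert( popcount_fallback(0xFu) == 4 );        // 0b1111 has four set bits
  assert( popcount_fallback(0x80000001u) == 2 ); // highest and lowest bit set
  return 0;
}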


@@ -713,13 +713,20 @@ public:
  typedef Kokkos::Experimental::DynRankView< const T , device > const_dView0 ;
  typedef Kokkos::Experimental::DynRankView< T, device, Kokkos::MemoryUnmanaged > dView0_unmanaged ;
-  typedef typename dView0::host_mirror_space host ;
+  typedef typename dView0::host_mirror_space host_drv_space ;
+
+  typedef Kokkos::Experimental::View< T , device > View0 ;
+  typedef Kokkos::Experimental::View< T* , device > View1 ;
+  typedef Kokkos::Experimental::View< T******* , device > View7 ;
+
+  typedef typename View0::host_mirror_space host_view_space ;

  TestDynViewAPI()
  {
+    run_test_resize_realloc();
    run_test_mirror();
-    run_test();
    run_test_scalar();
+    run_test();
    run_test_const();
    run_test_subview();
    run_test_subview_strided();
@@ -735,19 +742,147 @@ public:
    TestViewOperator_LeftAndRight< int , device , 1 >::testit(2);
  }
static void run_test_resize_realloc()
{
dView0 drv0("drv0", 10, 20, 30);
ASSERT_EQ( drv0.rank(), 3);
Kokkos::Experimental::resize(drv0, 5, 10);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 5);
ASSERT_EQ( drv0.dimension_1(), 10);
ASSERT_EQ( drv0.dimension_2(), 1);
Kokkos::Experimental::realloc(drv0, 10, 20);
ASSERT_EQ( drv0.rank(), 2);
ASSERT_EQ( drv0.dimension_0(), 10);
ASSERT_EQ( drv0.dimension_1(), 20);
ASSERT_EQ( drv0.dimension_2(), 1);
}
  static void run_test_mirror()
  {
-    typedef Kokkos::Experimental::DynRankView< int , host > view_type ;
+    typedef Kokkos::Experimental::DynRankView< int , host_drv_space > view_type ;
    typedef typename view_type::HostMirror mirror_type ;
    view_type a("a");
    mirror_type am = Kokkos::Experimental::create_mirror_view(a);
    mirror_type ax = Kokkos::Experimental::create_mirror(a);
    ASSERT_EQ( & a() , & am() );
ASSERT_EQ( a.rank() , am.rank() );
ASSERT_EQ( ax.rank() , am.rank() );
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = (a_h.data() ==a_h2.data())?1:0;
int equal_ptr_h_d = (a_h.data() ==a_d. data())?1:0;
int equal_ptr_h2_d = (a_h2.data()==a_d. data())?1:0;
ASSERT_EQ(equal_ptr_h_h2,0);
ASSERT_EQ(equal_ptr_h_d ,0);
ASSERT_EQ(equal_ptr_h2_d,0);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutLeft, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
Kokkos::DynRankView<double, Kokkos::LayoutRight, Kokkos::HostSpace> a_h("A",1000);
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
if (Kokkos::HostSpace::execution_space::is_initialized() )
{
typedef Kokkos::DynRankView< int , Kokkos::LayoutStride , Kokkos::HostSpace > view_stride_type ;
unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
view_stride_type a_h( "a" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
auto a_h2 = Kokkos::create_mirror_view(Kokkos::HostSpace(),a_h);
auto a_d = Kokkos::create_mirror_view(typename device::memory_space(),a_h);
int equal_ptr_h_h2 = a_h.data() ==a_h2.data()?1:0;
int equal_ptr_h_d = a_h.data() ==a_d. data()?1:0;
int equal_ptr_h2_d = a_h2.data()==a_d. data()?1:0;
int is_same_memspace = std::is_same<Kokkos::HostSpace,typename device::memory_space>::value?1:0;
ASSERT_EQ(equal_ptr_h_h2,1);
ASSERT_EQ(equal_ptr_h_d ,is_same_memspace);
ASSERT_EQ(equal_ptr_h2_d ,is_same_memspace);
ASSERT_EQ(a_h.dimension_0(),a_h2.dimension_0());
ASSERT_EQ(a_h.dimension_0(),a_d .dimension_0());
ASSERT_EQ(a_h.rank(),a_h2.rank());
ASSERT_EQ(a_h.rank(),a_d.rank());
}
  }

  static void run_test_scalar()
  {
-    typedef typename dView0::HostMirror hView0 ;
+    typedef typename dView0::HostMirror hView0 ; //HostMirror of DynRankView is a DynRankView

    dView0 dx , dy ;
    hView0 hx , hy ;
@@ -765,6 +900,79 @@ public:
    Kokkos::Experimental::deep_copy( hy , dy );
    ASSERT_EQ( hx(), hy() );
ASSERT_EQ( dx.rank() , hx.rank() );
ASSERT_EQ( dy.rank() , hy.rank() );
//View - DynRankView Interoperability tests
// deep_copy DynRankView to View
View0 vx("vx");
Kokkos::deep_copy( vx , dx );
ASSERT_EQ( rank(dx) , rank(vx) );
View0 vy("vy");
Kokkos::deep_copy( vy , dy );
ASSERT_EQ( rank(dy) , rank(vy) );
// deep_copy View to DynRankView
dView0 dxx("dxx");
Kokkos::deep_copy( dxx , vx );
ASSERT_EQ( rank(dxx) , rank(vx) );
View7 vcast = dx.ConstDownCast();
ASSERT_EQ( dx.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dx.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dx.dimension_2() , vcast.dimension_2() );
ASSERT_EQ( dx.dimension_3() , vcast.dimension_3() );
ASSERT_EQ( dx.dimension_4() , vcast.dimension_4() );
View7 vcast1( dy.ConstDownCast() );
ASSERT_EQ( dy.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dy.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dy.dimension_2() , vcast1.dimension_2() );
ASSERT_EQ( dy.dimension_3() , vcast1.dimension_3() );
ASSERT_EQ( dy.dimension_4() , vcast1.dimension_4() );
//View - DynRankView Interoperability tests
// copy View to DynRankView
dView0 dfromvx( vx );
auto hmx = Kokkos::create_mirror_view(dfromvx) ;
Kokkos::deep_copy(hmx , dfromvx);
auto hvx = Kokkos::create_mirror_view(vx) ;
Kokkos::deep_copy(hvx , vx);
ASSERT_EQ( rank(hvx) , rank(hmx) );
ASSERT_EQ( hvx.dimension_0() , hmx.dimension_0() );
ASSERT_EQ( hvx.dimension_1() , hmx.dimension_1() );
// copy-assign View to DynRankView
dView0 dfromvy = vy ;
auto hmy = Kokkos::create_mirror_view(dfromvy) ;
Kokkos::deep_copy(hmy , dfromvy);
auto hvy = Kokkos::create_mirror_view(vy) ;
Kokkos::deep_copy(hvy , vy);
ASSERT_EQ( rank(hvy) , rank(hmy) );
ASSERT_EQ( hvy.dimension_0() , hmy.dimension_0() );
ASSERT_EQ( hvy.dimension_1() , hmy.dimension_1() );
View7 vtest1("vtest1",2,2,2,2,2,2,2);
dView0 dfromv1( vtest1 );
ASSERT_EQ( dfromv1.rank() , vtest1.Rank );
ASSERT_EQ( dfromv1.dimension_0() , vtest1.dimension_0() );
ASSERT_EQ( dfromv1.dimension_1() , vtest1.dimension_1() );
ASSERT_EQ( dfromv1.use_count() , vtest1.use_count() );
dView0 dfromv2( vcast );
ASSERT_EQ( dfromv2.rank() , vcast.Rank );
ASSERT_EQ( dfromv2.dimension_0() , vcast.dimension_0() );
ASSERT_EQ( dfromv2.dimension_1() , vcast.dimension_1() );
ASSERT_EQ( dfromv2.use_count() , vcast.use_count() );
dView0 dfromv3 = vcast1;
ASSERT_EQ( dfromv3.rank() , vcast1.Rank );
ASSERT_EQ( dfromv3.dimension_0() , vcast1.dimension_0() );
ASSERT_EQ( dfromv3.dimension_1() , vcast1.dimension_1() );
ASSERT_EQ( dfromv3.use_count() , vcast1.use_count() );
  }

  static void run_test()
@@ -782,22 +990,32 @@ public:
      (void) thing;
    }
dView0 d_uninitialized(Kokkos::ViewAllocateWithoutInitializing("uninit"),10,20);
ASSERT_TRUE( d_uninitialized.data() != nullptr );
ASSERT_EQ( d_uninitialized.rank() , 2 );
ASSERT_EQ( d_uninitialized.dimension_0() , 10 );
ASSERT_EQ( d_uninitialized.dimension_1() , 20 );
ASSERT_EQ( d_uninitialized.dimension_2() , 1 );
    dView0 dx , dy , dz ;
    hView0 hx , hy , hz ;

-    ASSERT_TRUE( dx.ptr_on_device() == 0 );
-    ASSERT_TRUE( dy.ptr_on_device() == 0 );
-    ASSERT_TRUE( dz.ptr_on_device() == 0 );
+    ASSERT_TRUE( Kokkos::Experimental::is_dyn_rank_view<dView0>::value );
+    ASSERT_FALSE( Kokkos::Experimental::is_dyn_rank_view< Kokkos::View<double> >::value );
+
+    ASSERT_TRUE( dx.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( dy.ptr_on_device() == 0 ); //Okay with UVM
+    ASSERT_TRUE( dz.ptr_on_device() == 0 ); //Okay with UVM
    ASSERT_TRUE( hx.ptr_on_device() == 0 );
    ASSERT_TRUE( hy.ptr_on_device() == 0 );
    ASSERT_TRUE( hz.ptr_on_device() == 0 );
-    ASSERT_EQ( dx.dimension_0() , 0u );
-    ASSERT_EQ( dy.dimension_0() , 0u );
-    ASSERT_EQ( dz.dimension_0() , 0u );
+    ASSERT_EQ( dx.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , 0u ); //Okay with UVM
+    ASSERT_EQ( dz.dimension_0() , 0u ); //Okay with UVM
    ASSERT_EQ( hx.dimension_0() , 0u );
    ASSERT_EQ( hy.dimension_0() , 0u );
    ASSERT_EQ( hz.dimension_0() , 0u );
-    ASSERT_EQ( dx.rank() , 0u );
+    ASSERT_EQ( dx.rank() , 0u ); //Okay with UVM
    ASSERT_EQ( hx.rank() , 0u );

    dx = dView0( "dx" , N1 , N2 , N3 );
@@ -806,11 +1024,11 @@ public:
    hx = hView0( "hx" , N1 , N2 , N3 );
    hy = hView0( "hy" , N1 , N2 , N3 );

-    ASSERT_EQ( dx.dimension_0() , unsigned(N1) );
-    ASSERT_EQ( dy.dimension_0() , unsigned(N1) );
+    ASSERT_EQ( dx.dimension_0() , unsigned(N1) ); //Okay with UVM
+    ASSERT_EQ( dy.dimension_0() , unsigned(N1) ); //Okay with UVM
    ASSERT_EQ( hx.dimension_0() , unsigned(N1) );
    ASSERT_EQ( hy.dimension_0() , unsigned(N1) );
-    ASSERT_EQ( dx.rank() , 3 );
+    ASSERT_EQ( dx.rank() , 3 ); //Okay with UVM
    ASSERT_EQ( hx.rank() , 3 );

    dx = dView0( "dx" , N0 , N1 , N2 , N3 );
@@ -823,19 +1041,23 @@ public:
    ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
    ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
    ASSERT_EQ( dx.rank() , 4 );
+    ASSERT_EQ( dy.rank() , 4 );
    ASSERT_EQ( hx.rank() , 4 );
+    ASSERT_EQ( hy.rank() , 4 );

    ASSERT_EQ( dx.use_count() , size_t(1) );

    dView0_unmanaged unmanaged_dx = dx;
    ASSERT_EQ( dx.use_count() , size_t(1) );

    dView0_unmanaged unmanaged_from_ptr_dx = dView0_unmanaged(dx.ptr_on_device(),
                                                              dx.dimension_0(),
                                                              dx.dimension_1(),
                                                              dx.dimension_2(),
                                                              dx.dimension_3());

    {
      // Destruction of this view should be harmless
      const_dView0 unmanaged_from_ptr_const_dx( dx.ptr_on_device() ,
@@ -888,6 +1110,19 @@ public:
    hx = Kokkos::Experimental::create_mirror( dx );
    hy = Kokkos::Experimental::create_mirror( dy );
ASSERT_EQ( hx.rank() , dx.rank() );
ASSERT_EQ( hy.rank() , dy.rank() );
ASSERT_EQ( hx.dimension_0() , unsigned(N0) );
ASSERT_EQ( hx.dimension_1() , unsigned(N1) );
ASSERT_EQ( hx.dimension_2() , unsigned(N2) );
ASSERT_EQ( hx.dimension_3() , unsigned(N3) );
ASSERT_EQ( hy.dimension_0() , unsigned(N0) );
ASSERT_EQ( hy.dimension_1() , unsigned(N1) );
ASSERT_EQ( hy.dimension_2() , unsigned(N2) );
ASSERT_EQ( hy.dimension_3() , unsigned(N3) );
    // T v1 = hx() ;    // Generates compile error as intended
    // T v2 = hx(0,0) ; // Generates compile error as intended
    // hx(0,0) = v2 ;   // Generates compile error as intended
@@ -990,7 +1225,9 @@ public:
    for ( size_t i3 = 0 ; i3 < N3 ; ++i3 ) {
      { ASSERT_EQ( hx(ip,i1,i2,i3) , T(0) ); }
    }}}}
//  ASSERT_EQ( hx(0,0,0,0,0,0,0,0) , T(0) ); //Test rank8 op behaves properly - if implemented
    }

    dz = dx ; ASSERT_EQ( dx, dz); ASSERT_NE( dy, dz);
    dz = dy ; ASSERT_EQ( dy, dz); ASSERT_NE( dx, dz);
@@ -1006,6 +1243,35 @@ public:
    ASSERT_TRUE( dx.ptr_on_device() == 0 );
    ASSERT_TRUE( dy.ptr_on_device() == 0 );
    ASSERT_TRUE( dz.ptr_on_device() == 0 );
//View - DynRankView Interoperability tests
// deep_copy from view to dynrankview
const int testdim = 4;
dView0 dxx("dxx",testdim);
View1 vxx("vxx",testdim);
auto hvxx = Kokkos::create_mirror_view(vxx);
for (int i = 0; i < testdim; ++i)
{ hvxx(i) = i; }
Kokkos::deep_copy(vxx,hvxx);
Kokkos::deep_copy(dxx,vxx);
auto hdxx = Kokkos::create_mirror_view(dxx);
Kokkos::deep_copy(hdxx,dxx);
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hdxx(i) ); }
ASSERT_EQ( rank(hdxx) , rank(hvxx) );
ASSERT_EQ( hdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvxx.dimension_0() );
// deep_copy from dynrankview to view
View1 vdxx("vdxx",testdim);
auto hvdxx = Kokkos::create_mirror_view(vdxx);
Kokkos::deep_copy(hvdxx , hdxx);
ASSERT_EQ( rank(hdxx) , rank(hvdxx) );
ASSERT_EQ( hvdxx.dimension_0() , testdim );
ASSERT_EQ( hdxx.dimension_0() , hvdxx.dimension_0() );
for (int i = 0; i < testdim; ++i)
{ ASSERT_EQ( hvxx(i) , hvdxx(i) ); }
  }

  typedef T DataType ;
@@ -1059,35 +1325,66 @@ public:
      // N0 = 1000,N1 = 3,N2 = 5,N3 = 7
      unsigned order[] = { 6,5,4,3,2,1,0 }, dimen[] = { N0, N1, N2, 2, 2, 2, 2 }; //LayoutRight equivalent
      sdView d7( "d7" , Kokkos::LayoutStride::order_dimensions(7, order, dimen) );
+      ASSERT_EQ( d7.rank() , 7 );

-      sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 ); //Should be rank0 subview
+      sdView ds0 = Kokkos::subdynrankview( d7 , 1 , 1 , 1 , 1 , 1 , 1 , 1 );
+      ASSERT_EQ( ds0.rank() , 0 );

      //Basic test - ALL
-      sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() ); //compiles and runs
+      sdView dsALL = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() );
+      ASSERT_EQ( dsALL.rank() , 7 );

-      // Send a single value for one rank
+      // Send a value to final rank returning rank 6 subview
      sdView dsm1 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , 1 );
+      ASSERT_EQ( dsm1.rank() , 6 );

-      // Send a std::pair as a rank
+      // Send a std::pair as argument to a rank
      sdView dssp = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , std::pair<unsigned,unsigned>(1,2) );
+      ASSERT_EQ( dssp.rank() , 7 );

-      // Send a kokkos::pair as a rank; take default layout as input
+      // Send a kokkos::pair as argument to a rank; take default layout as input
      dView0 dd0("dd0" , N0 , N1 , N2 , 2 , 2 , 2 , 2 ); //default layout
+      ASSERT_EQ( dd0.rank() , 7 );
      sdView dtkp = Kokkos::Experimental::subdynrankview( dd0 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+      ASSERT_EQ( dtkp.rank() , 7 );

      // Return rank 7 subview, taking a pair as one argument, layout stride input
      sdView ds7 = Kokkos::Experimental::subdynrankview( d7 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+      ASSERT_EQ( ds7.rank() , 7 );

      // Default Layout DynRankView
      dView dv6("dv6" , N0 , N1 , N2 , N3 , 2 , 2 );
+      ASSERT_EQ( dv6.rank() , 6 );

      // DynRankView with LayoutRight
      typedef Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , device > drView ;
      drView dr5( "dr5" , N0 , N1 , N2 , 2 , 2 );
+      ASSERT_EQ( dr5.rank() , 5 );
-      // LayoutStride but arranged as LayoutRight
-      unsigned order3[] = { 4,3,2,1,0 }, dimen3[] = { N0, N1, N2, 2, 2 };
-      sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order3, dimen3) );
+      // NOTE: unused arg_layout dimensions must be set to ~size_t(0) so that
+      // rank deduction can properly take place
unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
Kokkos::LayoutStride ls = Kokkos::LayoutStride::order_dimensions(5, order5, dimen5);
ls.dimension[5] = ~size_t(0);
ls.dimension[6] = ~size_t(0);
ls.dimension[7] = ~size_t(0);
sdView d5("d5", ls);
ASSERT_EQ( d5.rank() , 5 );
// LayoutStride arranged as LayoutRight - commented out as example that fails unit test
// unsigned order5[] = { 4,3,2,1,0 }, dimen5[] = { N0, N1, N2, 2, 2 };
// sdView d5( "d5" , Kokkos::LayoutStride::order_dimensions(5, order5, dimen5) );
//
// Fails the following unit test:
// ASSERT_EQ( d5.rank() , dr5.rank() );
//
// Explanation: In construction of the Kokkos::LayoutStride below, since the
// remaining dimensions are not specified, they will default to values of 0
// rather than ~size_t(0).
// When passed to the DynRankView constructor the default dimensions (of 0)
// will be counted toward the dynamic rank and returning an incorrect value
// (i.e. rank 7 rather than 5).
      // Check LayoutRight dr5 and LayoutStride d5 dimensions agree (as they should)
      ASSERT_EQ( d5.dimension_0() , dr5.dimension_0() );
@@ -1100,21 +1397,21 @@ public:
      // Rank 5 subview of rank 5 dynamic rank view, layout stride input
      sdView ds5 = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) );
+      ASSERT_EQ( ds5.rank() , 5 );

      // Pass in extra ALL arguments beyond the rank of the DynRank View.
      // This behavior is allowed - ignore the extra ALL arguments when
      // the src.rank() < number of arguments, but be careful!
      sdView ds5plus = Kokkos::Experimental::subdynrankview( d5 , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::ALL() , Kokkos::pair<unsigned,unsigned>(0,1) , Kokkos::ALL() );
+      ASSERT_EQ( ds5.rank() , ds5plus.rank() );
      ASSERT_EQ( ds5.dimension_0() , ds5plus.dimension_0() );
      ASSERT_EQ( ds5.dimension_4() , ds5plus.dimension_4() );
      ASSERT_EQ( ds5.dimension_5() , ds5plus.dimension_5() );
-      ASSERT_EQ( ds5.rank() , ds5plus.rank() );
-      ASSERT_EQ( ds5.rank() , 5 );

#if ! defined( KOKKOS_HAVE_CUDA ) || defined ( KOKKOS_USE_CUDA_UVM )
-      ASSERT_EQ( & ds5(1,1,1,1) - & ds5plus(1,1,1,1) , 0 );
      ASSERT_EQ( & ds5(1,1,1,1,0) - & ds5plus(1,1,1,1,0) , 0 );
-      ASSERT_EQ( & ds5(1,1,1,1,0,0) - & ds5plus(1,1,1,1,0,0) , 0 );
+      // passing argument to rank beyond the view's rank is allowed iff it is a 0.
#endif
      // Similar test to rank 5 above, but create rank 4 subview
@@ -1131,9 +1428,9 @@ public:
  static void run_test_subview_strided()
  {
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host > drview_left ;
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host > drview_right ;
-    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host > drview_stride ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutLeft , host_drv_space > drview_left ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutRight , host_drv_space > drview_right ;
+    typedef Kokkos::Experimental::DynRankView < int , Kokkos::LayoutStride , host_drv_space > drview_stride ;

    drview_left  xl2( "xl2", 100 , 200 );
    drview_right xr2( "xr2", 100 , 200 );
@@ -1159,35 +1456,37 @@ public:
    drview_left  xl4( "xl4", 10 , 20 , 30 , 40 );
    drview_right xr4( "xr4", 10 , 20 , 30 , 40 );

-    drview_stride yl4 = Kokkos::Experimental::subdynrankview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
-    drview_stride yr4 = Kokkos::Experimental::subdynrankview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    //Replace subdynrankview with subview - test
+    drview_stride yl4 = Kokkos::Experimental::subview( xl4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );
+    drview_stride yr4 = Kokkos::Experimental::subview( xr4 , 1 , Kokkos::ALL() , 2 , Kokkos::ALL() );

    ASSERT_EQ( yl4.dimension_0() , xl4.dimension_1() );
    ASSERT_EQ( yl4.dimension_1() , xl4.dimension_3() );
    ASSERT_EQ( yr4.dimension_0() , xr4.dimension_1() );
    ASSERT_EQ( yr4.dimension_1() , xr4.dimension_3() );
+    ASSERT_EQ( yl4.rank() , 2);
+    ASSERT_EQ( yr4.rank() , 2);

    ASSERT_EQ( & yl4(4,4) - & xl4(1,4,2,4) , 0 );
    ASSERT_EQ( & yr4(4,4) - & xr4(1,4,2,4) , 0 );
  }
  static void run_test_vector()
  {
    static const unsigned Length = 1000 , Count = 8 ;

-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host > multivector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host > multivector_right_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutLeft , host_drv_space > multivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutRight , host_drv_space > multivector_right_type ;

    multivector_type mv = multivector_type( "mv" , Length , Count );
    multivector_right_type mv_right = multivector_right_type( "mv" , Length , Count );

-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > svector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host > smultivector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_right_type ; //LayoutStride, not right; setup to match original ViewAPI calls... update
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_svector_type ;
-    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host > const_smultivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< T , Kokkos::LayoutStride , host_drv_space > smultivector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_right_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_svector_type ;
+    typedef typename Kokkos::Experimental::DynRankView< const T , Kokkos::LayoutStride , host_drv_space > const_smultivector_type ;

    svector_type v1 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 0 );
    svector_type v2 = Kokkos::Experimental::subdynrankview( mv , Kokkos::ALL() , 1 );
@@ -1251,7 +1550,6 @@ public:
    const_smultivector_type cmv( mv );
    typename smultivector_type::const_type cmvX( cmv );
    typename const_smultivector_type::const_type ccmvX( cmv );
  }
};


@@ -61,8 +61,7 @@ struct TestDynamicView
  typedef typename Space::execution_space execution_space ;
  typedef typename Space::memory_space    memory_space ;

-  typedef Kokkos::Experimental::MemoryPool< memory_space , execution_space >
-    memory_pool_type ;
+  typedef Kokkos::Experimental::MemoryPool<typename Space::device_type> memory_pool_type;

  typedef Kokkos::Experimental::DynamicView<Scalar*,Space> view_type;
@@ -129,11 +128,9 @@ struct TestDynamicView
    typedef Kokkos::TeamPolicy<execution_space,TEST> TestPolicy ;
    typedef Kokkos::TeamPolicy<execution_space,VERIFY> VerifyPolicy ;

-    const unsigned int chunk_size = 1024 ;

// printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size);
-    memory_pool_type pool( memory_space() , chunk_size , arg_total_size * sizeof(Scalar) );
+    memory_pool_type pool( memory_space() , arg_total_size * sizeof(Scalar) * 1.2 );

// printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size);


@@ -34,6 +34,7 @@
#cmakedefine KOKKOS_HAVE_Winthread
#cmakedefine KOKKOS_HAVE_OPENMP
#cmakedefine KOKKOS_HAVE_HWLOC
+#cmakedefine KOKKOS_HAVE_DEBUG
#cmakedefine KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK
#cmakedefine KOKKOS_HAVE_CXX11
#cmakedefine KOKKOS_HAVE_CUSPARSE


@@ -8,11 +8,22 @@ SET(SOURCES
  PerfTestCuda.cpp
  )

-TRIBITS_ADD_EXECUTABLE_AND_TEST(
-  PerfTest
+# Per #374, we always want to build this test, but we only want to run
+# it as a PERFORMANCE test. That's why we separate building the test
+# from running the test.
+TRIBITS_ADD_EXECUTABLE(
+  PerfTestExec
  SOURCES ${SOURCES}
  COMM serial mpi
-  NUM_MPI_PROCS 1
-  FAIL_REGULAR_EXPRESSION " FAILED "
  TESTONLYLIBS kokkos_gtest
  )

+TRIBITS_ADD_EXECUTABLE_AND_TEST(
+  PerfTest
+  NAME PerfTestExec
+  COMM serial mpi
+  NUM_MPI_PROCS 1
+  CATEGORIES PERFORMANCE
+  FAIL_REGULAR_EXPRESSION " FAILED "
+  )


@@ -159,7 +159,7 @@ struct TextureFetch
    Kokkos::Cuda::fence();

-    Kokkos::Impl::Timer timer;
+    Kokkos::Timer timer;
    for (int j=0; j<10; ++j) {
      RandomReduce f(array,indexes);
      f.apply(reduce);


@@ -153,7 +153,7 @@ struct ModifiedGramSchmidt
    Kokkos::deep_copy( one , (Scalar) 1 );

-    Kokkos::Impl::Timer timer ;
+    Kokkos::Timer timer ;

    for ( size_type j = 0 ; j < count ; ++j ) {
      // Reduction : tmp = dot( Q(:,j) , Q(:,j) );


@@ -252,7 +252,7 @@ struct HexGrad
  execution_space::fence();

  for ( int i = 0 ; i < iter ; ++i ) {
-    Kokkos::Impl::Timer timer ;
+    Kokkos::Timer timer ;
    Kokkos::parallel_for( count , HexGrad<execution_space>( coord , grad ) );
    execution_space::fence();
    const double dt = timer.seconds();


@@ -414,24 +414,27 @@ void Loop(int loop, int test, const char* type_name) {
  Kokkos::Impl::Timer timer;
  T res = LoopVariant<T>(loop,test);
-  double time1 = timer.seconds();
+  double time = timer.seconds();

  timer.reset();
  T resNonAtomic = LoopVariantNonAtomic<T>(loop,test);
-  double time2 = timer.seconds();
+  double timeNonAtomic = timer.seconds();

  timer.reset();
  T resSerial = LoopVariantSerial<T>(loop,test);
-  double time3 = timer.seconds();
+  double timeSerial = timer.seconds();

-  time1*=1e6/loop;
-  time2*=1e6/loop;
-  time3*=1e6/loop;
+  time         *=1e6/loop;
+  timeNonAtomic*=1e6/loop;
+  timeSerial   *=1e6/loop;

  //textcolor_standard();
  bool passed = true;
  if(resSerial!=res) passed = false;
  //if(!passed) textcolor(RESET,BLACK,YELLOW);
-  printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",type_name,test,passed?"PASSED":"FAILED",loop,1.0*resSerial,1.0*res,1.0*resNonAtomic,time1,time2,time3,(int)sizeof(T));
+  printf("%s Test %i %s --- Loop: %i Value (S,A,NA): %e %e %e Time: %7.4e %7.4e %7.4e Size of Type %i)",
+         type_name,test,passed?"PASSED":"FAILED",loop,
+         1.0*resSerial,1.0*res,1.0*resNonAtomic,
+         timeSerial,time,timeNonAtomic,(int)sizeof(T));
  //if(!passed) textcolor_standard();
  printf("\n");
}
@@ -452,7 +455,7 @@ void Test(int loop, int test, const char* type_name) {
int main(int argc, char* argv[])
{
  int type = -1;
-  int loop = 1000000;
+  int loop = 100000;
  int test = -1;
  for(int i=0;i<argc;i++)


@@ -124,15 +124,31 @@ unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits:
#endif

+namespace Kokkos {
+namespace Impl {
+  struct CudaLockArraysStruct {
+    int* atomic;
+    int* scratch;
+    int* threadid;
+  };
+}
+}

__device__ __constant__
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
extern
#endif
-int* kokkos_impl_cuda_atomic_lock_array ;
+Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;

#define CUDA_SPACE_ATOMIC_MASK 0x1FFFF
#define CUDA_SPACE_ATOMIC_XOR_MASK 0x15A39

+namespace Kokkos {
+namespace Impl {
+  void* cuda_resize_scratch_space(size_t bytes, bool force_shrink = false);
+}
+}

namespace Kokkos {
namespace Impl {

__device__ inline
@@ -140,8 +156,7 @@ bool lock_address_cuda_space(void* ptr) {
  size_t offset = size_t(ptr);
  offset = offset >> 2;
  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
-  return (0 == atomicCAS(&kokkos_impl_cuda_atomic_lock_array[offset],0,1));
+  return (0 == atomicCAS(&kokkos_impl_cuda_lock_arrays.atomic[offset],0,1));
}

__device__ inline
@@ -149,8 +164,7 @@ void unlock_address_cuda_space(void* ptr) {
  size_t offset = size_t(ptr);
  offset = offset >> 2;
  offset = offset & CUDA_SPACE_ATOMIC_MASK;
-  //offset = offset xor CUDA_SPACE_ATOMIC_XOR_MASK;
-  atomicExch( &kokkos_impl_cuda_atomic_lock_array[ offset ], 0);
+  atomicExch( &kokkos_impl_cuda_lock_arrays.atomic[ offset ], 0);
}

}
@@ -232,8 +246,11 @@ struct CudaParallelLaunch< DriverType , true > {
      cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) );

      #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-      int* lock_array_ptr = lock_array_cuda_space_ptr();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
      #endif

      // Invoke the driver function on the device
@@ -271,8 +288,11 @@ struct CudaParallelLaunch< DriverType , false > {
      #endif

      #ifndef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
-      int* lock_array_ptr = lock_array_cuda_space_ptr();
-      cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
+      Kokkos::Impl::CudaLockArraysStruct locks;
+      locks.atomic = atomic_lock_array_cuda_space_ptr(false);
+      locks.scratch = scratch_lock_array_cuda_space_ptr(false);
+      locks.threadid = threadid_lock_array_cuda_space_ptr(false);
+      cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
      #endif

      cuda_parallel_launch_local_memory< DriverType ><<< grid , block , shmem , stream >>>( driver );


@@ -51,10 +51,10 @@
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA

+#include <Kokkos_Core.hpp>
#include <Kokkos_Cuda.hpp>
#include <Kokkos_CudaSpace.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>

#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_Error.hpp>
@@ -107,68 +107,6 @@ void DeepCopyAsyncCuda( void * dst , const void * src , size_t n) {
namespace Kokkos {
#if ! KOKKOS_USING_EXP_VIEW
namespace {
void texture_object_attach_impl( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
enum { TEXTURE_BOUND_1D = 2u << 27 };
if ( tracker.attribute() == NULL ) {
// check for correct allocator
const bool ok_alloc = tracker.allocator()->support_texture_binding();
const bool ok_count = (tracker.alloc_size() / type_size) < TEXTURE_BOUND_1D;
if (ok_alloc && ok_count) {
Impl::TextureAttribute * attr = new Impl::TextureAttribute( tracker.alloc_ptr(), tracker.alloc_size(), desc );
tracker.set_attribute( attr );
}
else {
std::ostringstream oss;
oss << "Error: Cannot attach texture object";
if (!ok_alloc) {
oss << ", incompatabile allocator " << tracker.allocator()->name();
}
if (!ok_count) {
oss << ", array " << tracker.label() << " too large";
}
oss << ".";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
if ( NULL == dynamic_cast<Impl::TextureAttribute *>(tracker.attribute()) ) {
std::ostringstream oss;
oss << "Error: Allocation " << tracker.label() << " already has an attribute attached.";
Kokkos::Impl::throw_runtime_exception( oss.str() );
}
}
} // unnamed namespace
/*--------------------------------------------------------------------------*/
Impl::AllocationTracker CudaSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
void CudaSpace::access_error()
{
  const std::string msg("Kokkos::CudaSpace::access_error attempt to execute Cuda function from non-Cuda space" );
@@ -183,23 +121,6 @@ void CudaSpace::access_error( const void * const )
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaUVMSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
void CudaUVMSpace::texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
)
{
texture_object_attach_impl( tracker, type_size, desc );
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
bool CudaUVMSpace::available()
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION ) && !defined(__APPLE__)
@@ -212,15 +133,6 @@ bool CudaUVMSpace::available()
/*--------------------------------------------------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker CudaHostPinnedSpace::allocate_and_track( const std::string & label, const size_t size )
{
return Impl::AllocationTracker( allocator(), size, label);
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
} // namespace Kokkos

/*--------------------------------------------------------------------------*/
@@ -824,16 +736,26 @@ print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bo
namespace Kokkos {
namespace {

-__global__ void init_lock_array_kernel() {
+__global__ void init_lock_array_kernel_atomic() {
  unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
  if(i<CUDA_SPACE_ATOMIC_MASK+1)
-    kokkos_impl_cuda_atomic_lock_array[i] = 0;
+    kokkos_impl_cuda_lock_arrays.atomic[i] = 0;
}
__global__ void init_lock_array_kernel_scratch_threadid(int N) {
unsigned i = blockIdx.x*blockDim.x + threadIdx.x;
if(i<N) {
kokkos_impl_cuda_lock_arrays.scratch[i] = 0;
kokkos_impl_cuda_lock_arrays.threadid[i] = 0;
}
}
}
namespace Impl {
int* atomic_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
@ -845,13 +767,60 @@ int* lock_array_cuda_space_ptr(bool deallocate) {
return ptr;
}
void init_lock_array_cuda_space() {
int is_initialized = 0;
if(! is_initialized) {
int* lock_array_ptr = lock_array_cuda_space_ptr();
cudaMemcpyToSymbol( kokkos_impl_cuda_atomic_lock_array , & lock_array_ptr , sizeof(int*) );
init_lock_array_kernel<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
}
}
int* scratch_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
int* threadid_lock_array_cuda_space_ptr(bool deallocate) {
static int* ptr = NULL;
if(deallocate) {
cudaFree(ptr);
ptr = NULL;
}
if(ptr==NULL && !deallocate)
cudaMalloc(&ptr,sizeof(int)*(Cuda::concurrency()));
return ptr;
}
void init_lock_arrays_cuda_space() {
static int is_initialized = 0;
if(! is_initialized) {
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
init_lock_array_kernel_atomic<<<(CUDA_SPACE_ATOMIC_MASK+255)/256,256>>>();
init_lock_array_kernel_scratch_threadid<<<(Kokkos::Cuda::concurrency()+255)/256,256>>>(Kokkos::Cuda::concurrency());
}
}
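For orientation, a minimal host-side sketch of how an arbitrary address is typically folded onto one of the CUDA_SPACE_ATOMIC_MASK+1 atomic lock slots initialized above; the mask value and hash shown here are assumptions for illustration, not the exact mapping used by Kokkos:
#include <cstdint>
#include <cstdio>
// Assumed mask value for illustration only; the real CUDA_SPACE_ATOMIC_MASK
// is defined elsewhere in the Cuda backend.
enum { SPACE_ATOMIC_MASK_SKETCH = 0x1FFFF };
// Fold an address into a lock index in [0, mask]; dropping the low bits
// avoids mapping neighboring words of the same object to distinct hot slots.
inline unsigned lock_index_sketch(const void* ptr) {
  return static_cast<unsigned>(reinterpret_cast<std::uintptr_t>(ptr) >> 2)
         & SPACE_ATOMIC_MASK_SKETCH;
}
int main() {
  int x = 0;
  std::printf("lock slot for &x: %u\n", lock_index_sketch(&x));
  return 0;
}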
void* cuda_resize_scratch_space(size_t bytes, bool force_shrink) {
static void* ptr = NULL;
static size_t current_size = 0;
if(current_size == 0) {
current_size = bytes;
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
if(bytes > current_size) {
current_size = bytes;
ptr = Kokkos::kokkos_realloc<Kokkos::CudaSpace>(ptr,current_size);
}
if((bytes < current_size) && (force_shrink)) {
current_size = bytes;
Kokkos::kokkos_free<Kokkos::CudaSpace>(ptr);
ptr = Kokkos::kokkos_malloc<Kokkos::CudaSpace>("CudaSpace::ScratchMemory",current_size);
}
return ptr;
}
}
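The scratch buffer above is deliberately sticky: it grows on demand, is reused for smaller requests, and only shrinks when the caller passes force_shrink. A hedged usage sketch of this internal function (assuming its declaration is visible from the internal Cuda headers and Kokkos has been initialized):
#include <Kokkos_Core.hpp>
// Sketch only: cuda_resize_scratch_space is an internal Impl function;
// this assumes Kokkos::initialize() has already been called.
void scratch_space_example() {
  void* p = Kokkos::Impl::cuda_resize_scratch_space(1 << 20, false); // grow to 1 MiB
  void* q = Kokkos::Impl::cuda_resize_scratch_space(4096, false);    // fits: no reallocation
  // p == q here, since the 4 KiB request reuses the existing 1 MiB buffer.
  Kokkos::Impl::cuda_resize_scratch_space(4096, true);               // explicit shrink to 4 KiB
  (void)p; (void)q;
}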

View File

@ -50,7 +50,6 @@
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos {
namespace Impl {

View File

@ -1,198 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Error.hpp>
#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
TextureAttribute::TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
)
: m_tex_obj(0)
{
cuda_device_synchronize();
struct cudaResourceDesc resDesc ;
struct cudaTextureDesc texDesc ;
memset( & resDesc , 0 , sizeof(resDesc) );
memset( & texDesc , 0 , sizeof(texDesc) );
resDesc.resType = cudaResourceTypeLinear ;
resDesc.res.linear.desc = desc ;
resDesc.res.linear.sizeInBytes = alloc_size ;
resDesc.res.linear.devPtr = alloc_ptr ;
CUDA_SAFE_CALL( cudaCreateTextureObject( & m_tex_obj , & resDesc, & texDesc, NULL) );
cuda_device_synchronize();
}
TextureAttribute::~TextureAttribute()
{
if (m_tex_obj) {
cudaDestroyTextureObject( m_tex_obj );
}
}
/*--------------------------------------------------------------------------*/
void * CudaMallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMalloc( &ptr, size ) );
return ptr;
}
void CudaMallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaMallocAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaUVMAllocator::allocate( size_t size )
{
#if defined( CUDA_VERSION ) && ( 6000 <= CUDA_VERSION )
void * ptr = NULL;
CUDA_SAFE_CALL( cudaMallocManaged( &ptr, size, cudaMemAttachGlobal ) );
return ptr;
#else
throw_runtime_exception( "CUDA VERSION does not support UVM" );
return NULL;
#endif
}
void CudaUVMAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFree( ptr ) );
} catch(...) {}
}
void * CudaUVMAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyDefault ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
void * CudaHostAllocator::allocate( size_t size )
{
void * ptr = NULL;
CUDA_SAFE_CALL( cudaHostAlloc( &ptr , size , cudaHostAllocDefault ) );
return ptr;
}
void CudaHostAllocator::deallocate( void * ptr, size_t /*size*/ )
{
try {
CUDA_SAFE_CALL( cudaFreeHost( ptr ) );
} catch(...) {}
}
void * CudaHostAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size != new_size) {
ptr = allocate( new_size );
size_t copy_size = old_size < new_size ? old_size : new_size;
CUDA_SAFE_CALL( cudaMemcpy( ptr , old_ptr , copy_size , cudaMemcpyHostToHost ) );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */

View File

@ -1,190 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#define KOKKOS_CUDA_BASIC_ALLOCATORS_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
/* only compile this file if CUDA is enabled for Kokkos */
#ifdef KOKKOS_HAVE_CUDA
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_AllocationTracker.hpp> // AllocatorAttributeBase
namespace Kokkos { namespace Impl {
// Cuda 5.0 <texture_types.h> defines 'cudaTextureObject_t'
// to be an 'unsigned long long'. This could change with
// future versions of Cuda and this typedef would have to
// change accordingly.
#if defined( CUDA_VERSION ) && ( 5000 <= CUDA_VERSION )
typedef enable_if<
sizeof(::cudaTextureObject_t) == sizeof(const void *) ,
::cudaTextureObject_t >::type cuda_texture_object_type ;
#else
typedef const void * cuda_texture_object_type ;
#endif
struct TextureAttribute : public AllocatorAttributeBase
{
cuda_texture_object_type m_tex_obj ;
TextureAttribute( void * const alloc_ptr
, size_t alloc_size
, cudaChannelFormatDesc const & desc
);
~TextureAttribute();
};
/// class CudaUnmanagedAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedAllocator
{
static const char * name()
{
return "Cuda Unmanaged Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedUVMAllocator
/// does nothing when deallocate(ptr,size) is called
struct CudaUnmanagedUVMAllocator
{
static const char * name()
{
return "Cuda Unmanaged UVM Allocator";
}
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
static bool support_texture_binding() { return true; }
};
/// class CudaUnmanagedHostAllocator
/// does nothing when deallocate(ptr,size) is called
class CudaUnmanagedHostAllocator
{
public:
static const char * name()
{
return "Cuda Unmanaged Host Allocator";
}
// Unmanaged deallocate does nothing
static void deallocate(void * /*ptr*/, size_t /*size*/) {}
};
/// class CudaMallocAllocator
class CudaMallocAllocator
{
public:
static const char * name()
{
return "Cuda Malloc Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaUVMAllocator
class CudaUVMAllocator
{
public:
static const char * name()
{
return "Cuda UVM Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
static bool support_texture_binding() { return true; }
};
/// class CudaHostAllocator
class CudaHostAllocator
{
public:
static const char * name()
{
return "Cuda Host Allocator";
}
static void* allocate(size_t size);
static void deallocate(void * ptr, size_t);
static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
};
}} // namespace Kokkos::Impl
#endif //KOKKOS_HAVE_CUDA
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_CUDA_BASIC_ALLOCATORS_HPP

View File

@ -51,8 +51,8 @@
#include <Cuda/Kokkos_Cuda_Error.hpp>
#include <Cuda/Kokkos_Cuda_Internal.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
/*--------------------------------------------------------------------------*/
/* Standard 'C' libraries */
@ -70,7 +70,7 @@ __device__ __constant__
unsigned long kokkos_impl_cuda_constant_memory_buffer[ Kokkos::Impl::CudaTraits::ConstantMemoryUsage / sizeof(unsigned long) ] ;
__device__ __constant__
Kokkos::Impl::CudaLockArraysStruct kokkos_impl_cuda_lock_arrays ;
#endif
@ -190,7 +190,7 @@ namespace {
class CudaInternalDevices {
public:
enum { MAXIMUM_DEVICE_COUNT = 64 };
struct cudaDeviceProp m_cudaProp[ MAXIMUM_DEVICE_COUNT ] ;
int m_cudaDevCount ;
@ -206,6 +206,9 @@ CudaInternalDevices::CudaInternalDevices()
CUDA_SAFE_CALL (cudaGetDeviceCount( & m_cudaDevCount ) );
if(m_cudaDevCount > MAXIMUM_DEVICE_COUNT) {
Kokkos::abort("Sorry, you have more GPUs per node than we thought anybody would ever have. Please report this to github.com/kokkos/kokkos.");
}
for ( int i = 0 ; i < m_cudaDevCount ; ++i ) {
CUDA_SAFE_CALL( cudaGetDeviceProperties( m_cudaProp + i , i ) );
}
@ -226,14 +229,6 @@ private:
CudaInternal( const CudaInternal & );
CudaInternal & operator = ( const CudaInternal & );
#if ! KOKKOS_USING_EXP_VIEW
AllocationTracker m_scratchFlagsTracker;
AllocationTracker m_scratchSpaceTracker;
AllocationTracker m_scratchUnifiedTracker;
#endif
public:
@ -255,6 +250,8 @@ public:
size_type * m_scratchUnified ;
cudaStream_t * m_stream ;
static int was_initialized;
static int was_finalized;
static CudaInternal & singleton();
@ -293,6 +290,8 @@ public:
size_type * scratch_unified( const size_type size );
};
int CudaInternal::was_initialized = 0;
int CudaInternal::was_finalized = 0;
//----------------------------------------------------------------------------
@ -367,6 +366,10 @@ CudaInternal & CudaInternal::singleton()
void CudaInternal::initialize( int cuda_device_id , int stream_count )
{
if ( was_finalized ) Kokkos::abort("Calling Cuda::initialize after Cuda::finalize is illegal\n");
was_initialized = 1;
if ( is_initialized() ) return;
enum { WordSize = sizeof(size_type) };
if ( ! HostSpace::execution_space::is_initialized() ) {
@ -526,11 +529,14 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count )
cudaThreadSetCacheConfig(cudaFuncCachePreferShared);
// Init the array used for arbitrarily sized atomics
Impl::init_lock_arrays_cuda_space();
#ifdef KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE
Kokkos::Impl::CudaLockArraysStruct locks;
locks.atomic = atomic_lock_array_cuda_space_ptr(false);
locks.scratch = scratch_lock_array_cuda_space_ptr(false);
locks.threadid = threadid_lock_array_cuda_space_ptr(false);
cudaMemcpyToSymbol( kokkos_impl_cuda_lock_arrays , & locks , sizeof(CudaLockArraysStruct) );
#endif
}
@ -548,14 +554,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlagsCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchFlagsTracker = CudaSpace::allocate_and_track( std::string("InternalScratchFlags") , sizeof( ScratchGrain ) * m_scratchFlagsCount );
m_scratchFlags = reinterpret_cast<size_type *>(m_scratchFlagsTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
@ -566,9 +564,6 @@ CudaInternal::scratch_flags( const Cuda::size_type size )
m_scratchFlags = reinterpret_cast<size_type *>( r->data() );
#endif
CUDA_SAFE_CALL( cudaMemset( m_scratchFlags , 0 , m_scratchFlagsCount * sizeScratchGrain ) );
}
@ -582,14 +577,6 @@ CudaInternal::scratch_space( const Cuda::size_type size )
m_scratchSpaceCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchSpaceTracker = CudaSpace::allocate_and_track( std::string("InternalScratchSpace") , sizeof( ScratchGrain ) * m_scratchSpaceCount );
m_scratchSpace = reinterpret_cast<size_type *>(m_scratchSpaceTracker.alloc_ptr());
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaSpace()
@ -599,9 +586,6 @@ CudaInternal::scratch_space( const Cuda::size_type size )
Record::increment( r );
m_scratchSpace = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchSpace ;
@ -615,14 +599,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
m_scratchUnifiedCount = ( size + sizeScratchGrain - 1 ) / sizeScratchGrain ;
#if ! KOKKOS_USING_EXP_VIEW
m_scratchUnifiedTracker = CudaHostPinnedSpace::allocate_and_track( std::string("InternalScratchUnified") , sizeof( ScratchGrain ) * m_scratchUnifiedCount );
m_scratchUnified = reinterpret_cast<size_type *>( m_scratchUnifiedTracker.alloc_ptr() );
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void > Record ;
Record * const r = Record::allocate( Kokkos::CudaHostPinnedSpace()
@ -632,9 +608,6 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
Record::increment( r );
m_scratchUnified = reinterpret_cast<size_type *>( r->data() );
#endif
}
return m_scratchUnified ;
@ -644,9 +617,13 @@ CudaInternal::scratch_unified( const Cuda::size_type size )
void CudaInternal::finalize()
{
was_finalized = 1;
if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) {
atomic_lock_array_cuda_space_ptr(false);
scratch_lock_array_cuda_space_ptr(false);
threadid_lock_array_cuda_space_ptr(false);
if ( m_stream ) {
for ( size_type i = 1 ; i < m_streamCount ; ++i ) {
cudaStreamDestroy( m_stream[i] );
@ -655,14 +632,6 @@ void CudaInternal::finalize()
::free( m_stream );
}
#if ! KOKKOS_USING_EXP_VIEW
m_scratchSpaceTracker.clear();
m_scratchFlagsTracker.clear();
m_scratchUnifiedTracker.clear();
#else
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaSpace > RecordCuda ;
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< CudaHostPinnedSpace > RecordHost ;
@ -670,8 +639,6 @@ void CudaInternal::finalize()
RecordCuda::decrement( RecordCuda::get_record( m_scratchSpace ) );
RecordHost::decrement( RecordHost::get_record( m_scratchUnified ) );
#endif
m_cudaDev = -1 ;
m_multiProcCount = 0 ;
m_maxWarpCount = 0 ;
@ -730,7 +697,13 @@ int Cuda::is_initialized()
{ return Impl::CudaInternal::singleton().is_initialized(); }
void Cuda::initialize( const Cuda::SelectDevice config , size_t num_instances )
{
Impl::CudaInternal::singleton().initialize( config.cuda_device_id , num_instances );
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
}
std::vector<unsigned>
Cuda::detect_device_arch()
@ -763,7 +736,13 @@ Cuda::size_type Cuda::device_arch()
}
void Cuda::finalize()
{
Impl::CudaInternal::singleton().finalize();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
Cuda::Cuda()
: m_device( Impl::CudaInternal::singleton().m_cudaDev )

View File

@ -57,17 +57,20 @@ template<class DriverType, bool Large>
struct CudaGetMaxBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_max_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetMaxBlockSize<DriverType,Large>::get_block_size(f,vector_length, shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -76,7 +79,8 @@ struct CudaGetMaxBlockSize<DriverType,true> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -91,11 +95,13 @@ struct CudaGetMaxBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetMaxBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int numBlocks;
int blockSize=32;
int sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_local_memory<DriverType>,
@ -104,7 +110,8 @@ struct CudaGetMaxBlockSize<DriverType,false> {
while (blockSize<1024 && numBlocks>0) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
@ -123,13 +130,15 @@ template<class DriverType, bool Large>
struct CudaGetOptBlockSize;
template<class DriverType, bool Large = (CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType))>
int cuda_get_opt_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
return CudaGetOptBlockSize<DriverType,Large>::get_block_size(f,vector_length,shmem_extra_block,shmem_extra_thread);
}
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,true> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -140,7 +149,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
blockSize*=2;
//calculate the occupancy with that optBlockSize and check whether it's larger than the largest one found so far
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,
cuda_parallel_launch_constant_memory<DriverType>,
@ -157,7 +167,8 @@ struct CudaGetOptBlockSize<DriverType,true> {
template<class DriverType>
struct CudaGetOptBlockSize<DriverType,false> {
static int get_block_size(const typename DriverType::functor_type & f, const size_t vector_length,
const size_t shmem_extra_block, const size_t shmem_extra_thread) {
int blockSize=16;
int numBlocks;
int sharedmem;
@ -166,7 +177,8 @@ struct CudaGetOptBlockSize<DriverType,false> {
while(blockSize<1024) {
blockSize*=2;
sharedmem = shmem_extra_block + shmem_extra_thread*(blockSize/vector_length) +
FunctorTeamShmemSize< typename DriverType::functor_type >::value( f , blockSize/vector_length );
cudaOccupancyMaxActiveBlocksPerMultiprocessor(
&numBlocks,

File diff suppressed because it is too large
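A worked example of the shared-memory formula introduced in the block-size search above: the extra request is now split into a once-per-block part and a per-team-member part. All input values below are assumed for illustration:
#include <cstdio>
int main() {
  const int blockSize          = 128;  // candidate block size (assumed)
  const int vector_length      = 4;    // vector lanes per team member (assumed)
  const int shmem_extra_block  = 512;  // bytes requested once per block (assumed)
  const int shmem_extra_thread = 16;   // bytes requested per team member (assumed)
  const int functor_shmem      = 1024; // FunctorTeamShmemSize value (assumed)
  const int members_per_block = blockSize / vector_length;     // 32
  const int sharedmem = shmem_extra_block
                      + shmem_extra_thread * members_per_block // 512
                      + functor_shmem;
  std::printf("sharedmem = %d bytes\n", sharedmem);            // 2048
  return 0;
}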

View File

@ -130,16 +130,17 @@ inline void cuda_intra_block_reduction( ValueType& value,
cuda_inter_warp_reduction(value,join,max_active_thread);
}
template< class FunctorType , class JoinOp , class ArgTag = void >
__device__
bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgTag >::reference_type value,
typename FunctorValueTraits< FunctorType , ArgTag >::reference_type neutral,
const JoinOp& join,
Cuda::size_type * const m_scratch_space,
typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type const result,
Cuda::size_type * const m_scratch_flags,
const int max_active_thread = blockDim.y) {
typedef typename FunctorValueTraits< FunctorType , ArgTag >::pointer_type pointer_type;
typedef typename FunctorValueTraits< FunctorType , ArgTag >::value_type value_type;
//Do the intra-block reduction with shfl operations and static shared memory
cuda_intra_block_reduction(value,join,max_active_thread);
@ -170,7 +171,7 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
if(id == 0)
*m_scratch_flags = 0;
last_block = true;
value = neutral;
pointer_type const volatile global = (pointer_type) m_scratch_space ;
@ -366,7 +367,12 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , void
size_type * const shared = shared_data + word_count.value * BlockSizeMask ;
size_type * const global = global_data + word_count.value * block_id ;
#if (__CUDA_ARCH__ < 500)
for ( size_type i = threadIdx.y ; i < word_count.value ; i += blockDim.y ) { global[i] = shared[i] ; }
#else
for ( size_type i = 0 ; i < word_count.value ; i += 1 ) { global[i] = shared[i] ; }
#endif
}
// Contributing blocks note that their contribution has been completed via an atomic-increment flag
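The new neutral argument exists because seeding the last block's threads with 0 is only correct for sum reductions; the seed must be the identity element of the join operation. A minimal host-side illustration with a min-reduction:
#include <algorithm>
#include <climits>
#include <cstdio>
int main() {
  const int data[4] = { 7, 3, 9, 5 };
  int seeded_zero    = 0;        // wrong seed: a min can never rise above 0
  int seeded_neutral = INT_MAX;  // identity element of min
  for (int v : data) {
    seeded_zero    = std::min(seeded_zero, v);
    seeded_neutral = std::min(seeded_neutral, v);
  }
  std::printf("seed 0 -> %d (wrong), seed INT_MAX -> %d (correct)\n",
              seeded_zero, seeded_neutral);
  return 0;
}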

View File

@ -0,0 +1,179 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
__device__
void TaskQueueSpecialization< Kokkos::Cuda >::driver
( TaskQueueSpecialization< Kokkos::Cuda >::queue_type * const queue )
{
using Member = TaskExec< Kokkos::Cuda > ;
using Queue = TaskQueue< Kokkos::Cuda > ;
using task_root_type = TaskBase< Kokkos::Cuda , void , void > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec( 1 );
Member team_exec( blockDim.y );
const int warp_lane = threadIdx.x + threadIdx.y * blockDim.x ;
union {
task_root_type * ptr ;
int raw[2] ;
} task ;
// Loop until all queues are empty and no tasks in flight
do {
// Each team lead attempts to acquire either a thread team task
// or collection of single thread tasks for the team.
if ( 0 == warp_lane ) {
task.ptr = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < Queue::NumQueue && end == task.ptr ; ++i ) {
for ( int j = 0 ; j < 2 && end == task.ptr ; ++j ) {
task.ptr = Queue::pop_task( & queue->m_ready[i][j] );
}
}
#if 0
printf("TaskQueue<Cuda>::driver(%d,%d) task(%lx)\n",threadIdx.z,blockIdx.x
, uintptr_t(task.ptr));
#endif
}
// shuffle broadcast
task.raw[0] = __shfl( task.raw[0] , 0 );
task.raw[1] = __shfl( task.raw[1] , 0 );
if ( 0 == task.ptr ) break ; // 0 == queue->m_ready_count
if ( end != task.ptr ) {
if ( task_root_type::TaskTeam == task.ptr->m_task_type ) {
// Thread Team Task
(*task.ptr->m_apply)( task.ptr , & team_exec );
}
else if ( 0 == threadIdx.y ) {
// Single Thread Task
(*task.ptr->m_apply)( task.ptr , & single_exec );
}
if ( 0 == warp_lane ) {
queue->complete( task.ptr );
}
}
} while(1);
}
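The union plus two 32-bit __shfl calls in the driver above is how a 64-bit task pointer is broadcast from lane 0 to the rest of the warp, since the pre-CUDA-9 __shfl used here only moves 32-bit values. A standalone sketch of the same trick; the kernel name and launch are illustrative:
#include <cstdio>
__global__ void broadcast_ptr_demo(int* p) {
  // Lane 0 holds the pointer; every other lane starts with null.
  union { int* ptr; int raw[2]; } task;
  task.ptr = (threadIdx.x == 0) ? p : 0;
  // Move the two 32-bit halves from lane 0 to the whole warp.
  task.raw[0] = __shfl(task.raw[0], 0);
  task.raw[1] = __shfl(task.raw[1], 0);
  if (threadIdx.x == 5 && task.ptr == p) printf("lane 5 received the pointer\n");
}
int main() {
  int* d = 0;
  cudaMalloc(&d, sizeof(int));
  broadcast_ptr_demo<<<1, 32>>>(d);
  cudaDeviceSynchronize();
  cudaFree(d);
  return 0;
}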
namespace {
__global__
void cuda_task_queue_execute( TaskQueue< Kokkos::Cuda > * queue )
{ TaskQueueSpecialization< Kokkos::Cuda >::driver( queue ); }
}
void TaskQueueSpecialization< Kokkos::Cuda >::execute
( TaskQueue< Kokkos::Cuda > * const queue )
{
const int warps_per_block = 4 ;
const dim3 grid( Kokkos::Impl::cuda_internal_multiprocessor_count() , 1 , 1 );
const dim3 block( 1 , Kokkos::Impl::CudaTraits::WarpSize , warps_per_block );
const int shared = 0 ;
const cudaStream_t stream = 0 ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute before\n");
#endif
// Query the stack size, in bytes:
//
// size_t stack_size = 0 ;
// CUDA_SAFE_CALL( cudaDeviceGetLimit( & stack_size , cudaLimitStackSize ) );
//
// If not large enough then set the stack size, in bytes:
//
// CUDA_SAFE_CALL( cudaDeviceSetLimit( cudaLimitStackSize , stack_size ) );
cuda_task_queue_execute<<< grid , block , shared , stream >>>( queue );
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
#if 0
printf("cuda_task_queue_execute after\n");
#endif
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -0,0 +1,519 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_CUDA_TASK_HPP
#define KOKKOS_IMPL_CUDA_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
namespace {
template< typename TaskType >
__global__
void set_cuda_task_base_apply_function_pointer
( TaskBase<Kokkos::Cuda,void,void>::function_type * ptr )
{ *ptr = TaskType::apply ; }
}
template<>
class TaskQueueSpecialization< Kokkos::Cuda >
{
public:
using execution_space = Kokkos::Cuda ;
using memory_space = Kokkos::CudaUVMSpace ;
using queue_type = TaskQueue< execution_space > ;
static
void iff_single_thread_recursive_execute( queue_type * const ) {}
__device__
static void driver( queue_type * const );
static
void execute( queue_type * const );
template< typename FunctorType >
static
void proc_set_apply( TaskBase<execution_space,void,void>::function_type * ptr )
{
using TaskType = TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
set_cuda_task_base_apply_function_pointer<TaskType><<<1,1>>>(ptr);
CUDA_SAFE_CALL( cudaGetLastError() );
CUDA_SAFE_CALL( cudaDeviceSynchronize() );
}
};
extern template class TaskQueue< Kokkos::Cuda > ;
//----------------------------------------------------------------------------
/**\brief Impl::TaskExec<Cuda> is the TaskPolicy<Cuda>::member_type
* passed to tasks running in a Cuda space.
*
* Cuda thread blocks for tasking are dimensioned:
* blockDim.x == vector length
* blockDim.y == team size
* blockDim.z == number of teams
* where
* blockDim.x * blockDim.y == WarpSize
*
* Both single thread and thread team tasks are run by a full Cuda warp.
* A single thread task is called by warp lane #0 and the remaining
* lanes of the warp are idle.
*/
template<>
class TaskExec< Kokkos::Cuda >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
friend class Kokkos::Impl::TaskQueue< Kokkos::Cuda > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::Cuda > ;
const int m_team_size ;
__device__
TaskExec( int arg_team_size = blockDim.y )
: m_team_size( arg_team_size ) {}
public:
#if defined( __CUDA_ARCH__ )
__device__ void team_barrier() { /* __threadfence_block(); */ }
__device__ int team_rank() const { return threadIdx.y ; }
__device__ int team_size() const { return m_team_size ; }
#else
__host__ void team_barrier() {}
__host__ int team_rank() const { return 0 ; }
__host__ int team_size() const { return 0 ; }
#endif
};
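Given the dimensioning described in the comment above, a quick host-side consistency check of the launch geometry used by execute(): blockDim.x times blockDim.y must equal the warp size, and blockDim.z counts warps per block. The vector length chosen below is an assumption:
#include <cassert>
#include <cstdio>
int main() {
  const int WarpSize        = 32;                       // CudaTraits::WarpSize
  const int vector_length   = 8;                        // blockDim.x (assumed)
  const int team_size       = WarpSize / vector_length; // blockDim.y == 4
  const int warps_per_block = 4;                        // blockDim.z, as in execute()
  assert(vector_length * team_size == WarpSize);
  std::printf("block = (%d,%d,%d), %d threads per block\n",
              vector_length, team_size, warps_per_block,
              vector_length * team_size * warps_per_block);
  return 0;
}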
//----------------------------------------------------------------------------
template<typename iType>
struct TeamThreadRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.y )
, end(arg_count)
, increment( blockDim.y )
, thread(arg_thread)
{}
__device__ inline
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
)
: start( arg_start + threadIdx.y )
, end( arg_end)
, increment( blockDim.y )
, thread( arg_thread )
{}
#else
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
TeamThreadRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread
, const iType & arg_start
, const iType & arg_end
);
#endif
};
//----------------------------------------------------------------------------
template<typename iType>
struct ThreadVectorRangeBoundariesStruct<iType, TaskExec< Kokkos::Cuda > >
{
typedef iType index_type;
const iType start ;
const iType end ;
const iType increment ;
const TaskExec< Kokkos::Cuda > & thread;
#if defined( __CUDA_ARCH__ )
__device__ inline
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count)
: start( threadIdx.x )
, end(arg_count)
, increment( blockDim.x )
, thread(arg_thread)
{}
#else
ThreadVectorRangeBoundariesStruct
( const TaskExec< Kokkos::Cuda > & arg_thread, const iType& arg_count);
#endif
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
TeamThreadRange( const Impl::TaskExec< Kokkos::Cuda > & thread, const iType & start , const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >(thread,start,end);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >
ThreadVectorRange( const Impl::TaskExec< Kokkos::Cuda > & thread
, const iType & count )
{
return Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >(thread,count);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl:: TaskExec< Kokkos::Cuda > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
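A hedged sketch of how a task body might use the TeamThreadRange overloads above; the functor, its members, and the member-type spelling are illustrative assumptions, not an API defined by this commit:
// Illustrative only: distribute i = 0..n-1 across the team running one task.
struct ScaleTask {
  double * data ;
  int      n ;
  template< typename MemberType >
  KOKKOS_INLINE_FUNCTION
  void operator()( MemberType & member ) const {
    Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n )
                        , [&]( int i ) { data[i] *= 2.0 ; } );
  }
};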
// reduce across corresponding lanes between team members within warp
// assume stride*team_size == warp_size
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void strided_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int team_size,
int stride)
{
for (int lane_delta=(team_size*stride)>>1; lane_delta>=stride; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, team_size*stride));
}
}
// multiple within-warp non-strided reductions
template< typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void multi_shfl_warp_reduction
(const JoinType& join,
ValueType& val,
int vec_length)
{
for (int lane_delta=vec_length>>1; lane_delta; lane_delta>>=1) {
join(val, Kokkos::shfl_down(val, lane_delta, vec_length));
}
}
// broadcast within warp
template< class ValueType >
KOKKOS_INLINE_FUNCTION
ValueType shfl_warp_broadcast
(ValueType& val,
int src_lane,
int width)
{
return Kokkos::shfl(val, src_lane, width);
}
// all-reduce across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction<ValueType, JoinType>(
join,
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce across corresponding vector lanes between team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
//TODO what is the point of creating this temporary?
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
strided_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
loop_boundaries.thread.team_size(),
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, threadIdx.x, Impl::CudaTraits::WarpSize );
}
// all-reduce within team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
const JoinType& join,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
multi_shfl_warp_reduction<ValueType, JoinType>(join, initialized_result, blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// all-reduce within team members within warp
// if no join() provided, use sum
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result) {
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i,result);
}
initialized_result = result;
//initialized_result = multi_shfl_warp_reduction(
multi_shfl_warp_reduction(
[&] (ValueType& val1, const ValueType& val2) { val1 += val2; },
initialized_result,
blockDim.x);
initialized_result = shfl_warp_broadcast<ValueType>( initialized_result, 0, blockDim.x );
}
// scan across corresponding vector lanes between team members within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda) {
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.y exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
// INCLUSIVE scan
for( int offset = blockDim.x ; offset < Impl::CudaTraits::WarpSize ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, Impl::CudaTraits::WarpSize);
if(threadIdx.y*blockDim.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val,
threadIdx.x+Impl::CudaTraits::WarpSize-blockDim.x,
Impl::CudaTraits::WarpSize);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, blockDim.x, Impl::CudaTraits::WarpSize);
if ( threadIdx.y == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
// scan within team member (vector) within warp
// assume vec_length*team_size == warp_size
// blockDim.x == vec_length == stride
// blockDim.y == team_size
// threadIdx.x == position in vec
// threadIdx.y == member number
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::Cuda > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, y, local_total;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
val = 0;
lambda(i,val,false);
// intra-blockDim.x exclusive scan on 'val'
// accum = accumulated, sum in total for this iteration
// INCLUSIVE scan
for( int offset = 1 ; offset < blockDim.x ; offset <<= 1 ) {
y = Kokkos::shfl_up(val, offset, blockDim.x);
if(threadIdx.x >= offset) { val += y; }
}
// pass accum to all threads
local_total = shfl_warp_broadcast<ValueType>(val, blockDim.x-1, blockDim.x);
// make EXCLUSIVE scan by shifting values over one
val = Kokkos::shfl_up(val, 1, blockDim.x);
if ( threadIdx.x == 0 ) { val = 0 ; }
val += accum;
lambda(i,val,true);
accum += local_total;
}
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */

View File

@ -46,9 +46,10 @@
#include <stdio.h>
#include <iostream>
#include <sstream>
#include <Kokkos_Core.hpp>
#include <Cuda/Kokkos_Cuda_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
// #define DETAILED_PRINT
@ -93,9 +94,8 @@ CudaTaskPolicyQueue
, const unsigned arg_team_size
)
: m_space( Kokkos::CudaUVMSpace()
, arg_task_max_size * arg_task_max_count * 1.2
, 16 /* log2(superblock size) */
, 1 /* only one level of memory pool */
)
, m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 }
@ -172,6 +172,8 @@ if ( IS_TEAM_LEAD && 0 != team_task ) {
member( kokkos_impl_cuda_shared_memory<void>()
, 16 /* shared_begin */
, team_task->m_shmem_size /* shared size */
, 0 /* scratch level 1 pointer */
, 0 /* scratch level 1 size */
, 0 /* league rank */
, 1 /* league size */
);
@ -926,5 +928,5 @@ void Task::clear_dependence()
} /* namespace Kokkos */ } /* namespace Kokkos */
#endif /* #if defined( KOKKOS_ENABLE_CUDA_TASK_POLICY ) */ #endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@@ -47,19 +47,11 @@
 #define KOKKOS_CUDA_TASKPOLICY_HPP
 #include <Kokkos_Core_fwd.hpp>
-#if defined( KOKKOS_HAVE_CUDA ) && \
-    defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
-#define KOKKOS_ENABLE_CUDA_TASK_POLICY
-/* The TaskPolicy< Cuda > capability requires nvcc using the option:
- *   --relocatable-device-code=true
- */
 #include <Kokkos_Cuda.hpp>
 #include <Kokkos_TaskPolicy.hpp>
+#if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY )
 //----------------------------------------------------------------------------
 namespace Kokkos {
@@ -81,8 +73,6 @@ public:
 private:
-friend struct CudaTaskPolicyQueue ;
 CudaTaskPolicyQueue * m_policy ;
 TaskMember * volatile * m_queue ;
 function_team_type m_team ; ///< Apply function on CUDA
@@ -819,9 +809,11 @@ public:
 static member_type member_single()
 {
 return
-  member_type( 0 /* shared memory */
-             , 0 /* shared memory begin */
-             , 0 /* shared memory size */
+  member_type( 0 /* shared memory pointer */
+             , 0 /* shared memory begin offset */
+             , 0 /* shared memory end offset */
+             , 0 /* scratch level_1 pointer */
+             , 0 /* scratch level_1 size */
              , 0 /* league rank */
              , 1 /* league size */ );
 }
@@ -832,10 +824,10 @@ public:
 } /* namespace Experimental */
 } /* namespace Kokkos */
-#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE ) */
 //----------------------------------------------------------------------------
+#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
 #endif /* #ifndef KOKKOS_CUDA_TASKPOLICY_HPP */

View File

@@ -56,8 +56,6 @@
 #include <impl/Kokkos_Shape.hpp>
 #include <Kokkos_View.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
@@ -90,343 +88,6 @@ struct AssertShapeBoundsAbort< CudaSpace >
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
#if ! KOKKOS_USING_EXP_VIEW
namespace Kokkos {
namespace Impl {
//----------------------------------------------------------------------------
// Cuda Texture fetches can be performed for 4, 8 and 16 byte objects (int,int2,int4)
// Via reinterpret_cast this can be used to support all scalar types of those sizes.
// Any other scalar type falls back to either normal reads out of global memory,
// or using the __ldg intrinsic on Kepler GPUs or newer (Compute Capability >= 3.0)
template< typename ValueType
, class MemorySpace
, class AliasType =
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 4 ) , int ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 8 ) , ::int2 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 16 ) , ::int4 ,
typename Kokkos::Impl::if_c< ( sizeof(ValueType) == 32 ) , ::float4 ,void
>::type
>::type
>::type
>::type
>
class CudaTextureFetch {
private:
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = tex1Dfetch<AliasType>( m_obj , i + m_offset );
return *(reinterpret_cast<ValueType*> (&v));
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, float4 > {
private:
typedef float4 AliasType;
cuda_texture_object_type m_obj ;
const ValueType * m_alloc_ptr ;
int m_offset ;
void attach( const ValueType * const arg_ptr, AllocationTracker const & tracker )
{
typedef char const * const byte;
m_alloc_ptr = reinterpret_cast<ValueType *>(tracker.alloc_ptr());
size_t byte_offset = reinterpret_cast<byte>(arg_ptr) - reinterpret_cast<byte>(m_alloc_ptr);
const bool ok_aligned = 0 == byte_offset % sizeof(ValueType);
const size_t count = tracker.alloc_size() / sizeof(ValueType);
const bool ok_contains = (m_alloc_ptr <= arg_ptr) && (arg_ptr < (m_alloc_ptr + count));
if (ok_aligned && ok_contains) {
if (tracker.attribute() == NULL ) {
MemorySpace::texture_object_attach(
tracker
, sizeof(ValueType)
, cudaCreateChannelDesc< AliasType >()
);
}
m_obj = dynamic_cast<TextureAttribute*>(tracker.attribute())->m_tex_obj;
m_offset = arg_ptr - m_alloc_ptr;
}
else if( !ok_contains ) {
throw_runtime_exception("Error: cannot attach a texture object to a tracker which does not bound the pointer.");
}
else {
throw_runtime_exception("Error: cannot attach a texture object to an incorrectly aligned pointer.");
}
}
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_obj() , m_alloc_ptr() , m_offset() {}
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs )
: m_obj( rhs.m_obj )
, m_alloc_ptr( rhs.m_alloc_ptr )
, m_offset( rhs.m_offset )
{}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs )
{
m_obj = rhs.m_obj ;
m_alloc_ptr = rhs.m_alloc_ptr ;
m_offset = rhs.m_offset ;
return *this ;
}
KOKKOS_INLINE_FUNCTION explicit
CudaTextureFetch( const ValueType * const arg_ptr, AllocationTracker const & tracker )
: m_obj( 0 ) , m_alloc_ptr(0) , m_offset(0)
{
#if defined( KOKKOS_USE_LDG_INTRINSIC )
m_alloc_ptr(arg_ptr);
#elif defined( __CUDACC__ ) && ! defined( __CUDA_ARCH__ )
if ( arg_ptr != NULL ) {
if ( tracker.is_valid() ) {
attach( arg_ptr, tracker );
}
else {
AllocationTracker found_tracker = AllocationTracker::find<typename MemorySpace::allocator>(arg_ptr);
if ( found_tracker.is_valid() ) {
attach( arg_ptr, found_tracker );
} else {
throw_runtime_exception("Error: cannot attach a texture object to an untracked pointer!");
}
}
}
#endif
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_alloc_ptr + m_offset ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
#if defined( KOKKOS_USE_LDG_INTRINSIC ) && defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
AliasType v = __ldg(reinterpret_cast<AliasType*>(&m_alloc_ptr[i]));
return *(reinterpret_cast<ValueType*> (&v));
#elif defined( __CUDA_ARCH__ ) && ( 300 <= __CUDA_ARCH__ )
union Float4ValueType {
float4 f4[2];
ValueType val;
};
Float4ValueType convert;
convert.f4[0] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset) );
convert.f4[1] = tex1Dfetch<AliasType>( m_obj , 2*(i + m_offset)+1 );
return convert.val;
#else
return m_alloc_ptr[ i + m_offset ];
#endif
}
};
template< typename ValueType, class MemorySpace >
class CudaTextureFetch< const ValueType, MemorySpace, void >
{
private:
const ValueType * m_ptr ;
public:
KOKKOS_INLINE_FUNCTION
CudaTextureFetch() : m_ptr(0) {};
KOKKOS_INLINE_FUNCTION
~CudaTextureFetch() {
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const ValueType * ptr, const AllocationTracker & ) : m_ptr(ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch( const CudaTextureFetch & rhs ) : m_ptr(rhs.m_ptr) {}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = ( const CudaTextureFetch & rhs ) {
m_ptr = rhs.m_ptr;
return *this ;
}
explicit KOKKOS_INLINE_FUNCTION
CudaTextureFetch( ValueType * const base_view_ptr, AllocationTracker const & /*tracker*/ ) {
m_ptr = base_view_ptr;
}
KOKKOS_INLINE_FUNCTION
CudaTextureFetch & operator = (const ValueType* base_view_ptr) {
m_ptr = base_view_ptr;
return *this;
}
KOKKOS_INLINE_FUNCTION
operator const ValueType * () const { return m_ptr ; }
template< typename iType >
KOKKOS_INLINE_FUNCTION
ValueType operator[]( const iType & i ) const
{
return m_ptr[ i ];
}
};
} // namespace Impl
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/** \brief Replace Default ViewDataHandle with Cuda texture fetch specialization
* if 'const' value type, CudaSpace and random access.
*/
template< class ViewTraits >
class ViewDataHandle< ViewTraits ,
typename enable_if< ( is_same< typename ViewTraits::memory_space,CudaSpace>::value ||
is_same< typename ViewTraits::memory_space,CudaUVMSpace>::value )
&&
is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value
&&
ViewTraits::memory_traits::RandomAccess
>::type >
{
public:
enum { ReturnTypeIsReference = false };
typedef Impl::CudaTextureFetch< typename ViewTraits::value_type
, typename ViewTraits::memory_space> handle_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & arg_tracker )
{
return handle_type(arg_data_ptr, arg_tracker);
}
typedef typename ViewTraits::value_type return_type;
};
}
}
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif // KOKKOS_HAVE_CUDA
#endif /* #ifndef KOKKOS_CUDA_VIEW_HPP */

View File

@ -0,0 +1,611 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#define KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
#include <Kokkos_ExecPolicy.hpp>
#include <Kokkos_Parallel.hpp>
#include <initializer_list>
#if defined(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION) && defined(KOKKOS_HAVE_PRAGMA_IVDEP) && !defined(__CUDA_ARCH__)
#define KOKKOS_MDRANGE_IVDEP
#endif
namespace Kokkos { namespace Experimental {
enum class Iterate
{
Default, // Default for the device
Left, // Left indices stride fastest
Right, // Right indices stride fastest
Flat, // Do not tile, only valid for inner direction
};
template <typename ExecSpace>
struct default_outer_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
template <typename ExecSpace>
struct default_inner_direction
{
using type = Iterate;
static constexpr Iterate value = Iterate::Right;
};
// Iteration Pattern
template < unsigned N
, Iterate OuterDir = Iterate::Default
, Iterate InnerDir = Iterate::Default
>
struct Rank
{
static_assert( N != 0u, "Kokkos Error: rank 0 undefined");
static_assert( N != 1u, "Kokkos Error: rank 1 is not a multi-dimensional range");
static_assert( N < 4u, "Kokkos Error: Unsupported rank...");
using iteration_pattern = Rank<N, OuterDir, InnerDir>;
static constexpr int rank = N;
static constexpr Iterate outer_direction = OuterDir;
static constexpr Iterate inner_direction = InnerDir;
};
// multi-dimensional iteration pattern
template <typename... Properties>
struct MDRangePolicy
{
using range_policy = RangePolicy<Properties...>;
static_assert( !std::is_same<range_policy,void>::value
, "Kokkos Error: MD iteration pattern not defined" );
using iteration_pattern = typename range_policy::iteration_pattern;
using work_tag = typename range_policy::work_tag;
static constexpr int rank = iteration_pattern::rank;
static constexpr int outer_direction = static_cast<int> (
(iteration_pattern::outer_direction != Iterate::Default && iteration_pattern::outer_direction != Iterate::Flat)
? iteration_pattern::outer_direction
: default_outer_direction< typename range_policy::execution_space>::value );
static constexpr int inner_direction = static_cast<int> (
iteration_pattern::inner_direction != Iterate::Default
? iteration_pattern::inner_direction
: default_inner_direction< typename range_policy::execution_space>::value ) ;
// Ugly workaround for Intel 14 not handling scoped enums correctly
static constexpr int Flat = static_cast<int>( Iterate::Flat );
static constexpr int Right = static_cast<int>( Iterate::Right );
using size_type = typename range_policy::index_type;
using index_type = typename std::make_signed<size_type>::type;
template <typename I>
MDRangePolicy( std::initializer_list<I> upper_corner )
{
static_assert( std::is_integral<I>::value, "Kokkos Error: corner defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( upper_corner.size() == rank, "Kokkos Error: upper_corner has incorrect rank" );
const auto u = upper_corner.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(0);
m_dim[i] = static_cast<index_type>(u[i]);
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
if (inner_direction != Flat) {
// default tile size to 4
m_tile[i] = 4;
} else {
m_tile[i] = 1;
}
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
template <typename IA, typename IB, typename T>
MDRangePolicy( std::initializer_list<IA> corner_a
, std::initializer_list<IB> corner_b
, std::initializer_list<T> tile
)
{
static_assert( std::is_integral<IA>::value, "Kokkos Error: corner A defined with non-integral type" );
static_assert( std::is_integral<IB>::value, "Kokkos Error: corner B defined with non-integral type" );
static_assert( std::is_integral<T>::value, "Kokkos Error: tile defined with non-integral type" );
static_assert( inner_direction != Flat, "Kokkos Error: tiling not supported with flat iteration" );
// TODO check size of lists equal to rank
// static_asserts on initializer_list.size() require c++14
//static_assert( corner_a.size() == rank, "Kokkos Error: corner_a has incorrect rank" );
//static_assert( corner_b.size() == rank, "Kokkos Error: corner_b has incorrect rank" );
//static_assert( tile.size() == rank, "Kokkos Error: tile has incorrect rank" );
using A = typename std::make_signed<IA>::type;
using B = typename std::make_signed<IB>::type;
const auto a = [=](int i) { return static_cast<A>(corner_a.begin()[i]); };
const auto b = [=](int i) { return static_cast<B>(corner_b.begin()[i]); };
const auto t = tile.begin();
m_num_tiles = 1;
for (int i=0; i<rank; ++i) {
m_offset[i] = static_cast<index_type>(a(i) <= b(i) ? a(i) : b(i));
m_dim[i] = static_cast<index_type>(a(i) <= b(i) ? b(i) - a(i) : a(i) - b(i));
m_tile[i] = static_cast<int>(t[i] > (T)0 ? t[i] : (T)1 );
m_tile_dim[i] = (m_dim[i] + (m_tile[i] - 1)) / m_tile[i];
m_num_tiles *= m_tile_dim[i];
}
}
index_type m_offset[rank];
index_type m_dim[rank];
int m_tile[rank];
index_type m_tile_dim[rank];
size_type m_num_tiles; // product of tile dims
};
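// Worked example of the tiling arithmetic above (illustrative comment,
// not part of the original header): for a rank-2 policy over {10,15}
// with tiles {4,4},
//   m_dim       = {10,15}
//   m_tile      = { 4, 4}
//   m_tile_dim  = { 3, 4}        // ceil(10/4) = 3 , ceil(15/4) = 4
//   m_num_tiles = 3 * 4 = 12     // flat 1-D dispatch range is [0,12)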
namespace Impl {
// Serial, Threads, OpenMP
// use enable_if to overload for Cuda
template < typename MDRange, typename Functor, typename Enable = void >
struct MDForFunctor
{
using work_tag = typename MDRange::work_tag;
using index_type = typename MDRange::index_type;
using size_type = typename MDRange::size_type;
MDRange m_range;
Functor m_func;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor const& f )
: m_range(range)
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange const& range, Functor && f )
: m_range(range)
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor const& f )
: m_range( std::forward<MDRange>(range) )
, m_func( f )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDRange && range, Functor && f )
: m_range( std::forward<MDRange>(range) )
, m_func( std::forward<Functor>(f) )
{}
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor const& ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor( MDForFunctor && ) = default;
KOKKOS_INLINE_FUNCTION
MDForFunctor& operator=( MDForFunctor && ) = default;
// Rank-2, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
m_func( work_tag{}, m_range.m_offset[0] + ( t / m_range.m_dim[1] )
, m_range.m_offset[1] + ( t % m_range.m_dim[1] ) );
} else {
m_func( work_tag{}, m_range.m_offset[0] + ( t % m_range.m_dim[0] )
, m_range.m_offset[1] + ( t / m_range.m_dim[0] ) );
}
}
// Rank-2, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1 );
}}
}
}
// Rank-2, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 2
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1;
if ( MDRange::outer_direction == MDRange::Right ) {
t0 = t / m_range.m_tile_dim[1];
t1 = t % m_range.m_tile_dim[1];
} else {
t0 = t % m_range.m_tile_dim[0];
t1 = t / m_range.m_tile_dim[0];
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i1=b1; i1<e1; ++i1) {
m_func( tag, i0, i1 );
}}
} else {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1 );
}}
}
}
//---------------------------------------------------------------------------
// Rank-3, Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction == MDRange::Flat
)>::type
operator()(Idx t) const
{
if ( MDRange::outer_direction == MDRange::Right ) {
const int64_t tmp_prod = m_range.m_dim[1]*m_range.m_dim[2];
m_func( work_tag{}
, m_range.m_offset[0] + ( t / tmp_prod )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[2] )
, m_range.m_offset[2] + ( (t % tmp_prod) % m_range.m_dim[2] )
);
} else {
const int64_t tmp_prod = m_range.m_dim[0]*m_range.m_dim[1];
m_func( work_tag{}
, m_range.m_offset[0] + ( (t % tmp_prod) % m_range.m_dim[0] )
, m_range.m_offset[1] + ( (t % tmp_prod) / m_range.m_dim[0] )
, m_range.m_offset[2] + ( t / tmp_prod )
);
}
}
// Rank-3, Not Flat, No Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( i0, i1, i2 );
}}}
}
}
// Rank-3, Not Flat, Tag
template <typename Idx>
KOKKOS_FORCEINLINE_FUNCTION
typename std::enable_if<( std::is_integral<Idx>::value
&& !std::is_same<void, work_tag>::value
&& MDRange::rank == 3
&& MDRange::inner_direction != MDRange::Flat
)>::type
operator()(Idx t) const
{
work_tag tag;
index_type t0, t1, t2;
if ( MDRange::outer_direction == MDRange::Right ) {
const index_type tmp_prod = ( m_range.m_tile_dim[1]*m_range.m_tile_dim[2]);
t0 = t / tmp_prod;
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[2];
t2 = ( t % tmp_prod ) % m_range.m_tile_dim[2];
} else {
const index_type tmp_prod = ( m_range.m_tile_dim[0]*m_range.m_tile_dim[1]);
t0 = ( t % tmp_prod ) % m_range.m_tile_dim[0];
t1 = ( t % tmp_prod ) / m_range.m_tile_dim[0];
t2 = t / tmp_prod;
}
const index_type b0 = t0 * m_range.m_tile[0] + m_range.m_offset[0];
const index_type b1 = t1 * m_range.m_tile[1] + m_range.m_offset[1];
const index_type b2 = t2 * m_range.m_tile[2] + m_range.m_offset[2];
const index_type e0 = b0 + m_range.m_tile[0] <= (m_range.m_dim[0] + m_range.m_offset[0] ) ? b0 + m_range.m_tile[0] : ( m_range.m_dim[0] + m_range.m_offset[0] );
const index_type e1 = b1 + m_range.m_tile[1] <= (m_range.m_dim[1] + m_range.m_offset[1] ) ? b1 + m_range.m_tile[1] : ( m_range.m_dim[1] + m_range.m_offset[1] );
const index_type e2 = b2 + m_range.m_tile[2] <= (m_range.m_dim[2] + m_range.m_offset[2] ) ? b2 + m_range.m_tile[2] : ( m_range.m_dim[2] + m_range.m_offset[2] );
if ( MDRange::inner_direction == MDRange::Right ) {
for (int i0=b0; i0<e0; ++i0) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i2=b2; i2<e2; ++i2) {
m_func( tag, i0, i1, i2 );
}}}
} else {
for (int i2=b2; i2<e2; ++i2) {
for (int i1=b1; i1<e1; ++i1) {
#if defined(KOKKOS_MDRANGE_IVDEP)
#pragma ivdep
#endif
for (int i0=b0; i0<e0; ++i0) {
m_func( tag, i0, i1, i2 );
}}}
}
}
};
} // namespace Impl
template <typename MDRange, typename Functor>
void md_parallel_for( MDRange const& range
, Functor const& f
, const std::string& str = ""
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
template <typename MDRange, typename Functor>
void md_parallel_for( const std::string& str
, MDRange const& range
, Functor const& f
)
{
Impl::MDForFunctor<MDRange, Functor> g(range, f);
using range_policy = typename MDRange::range_policy;
Kokkos::parallel_for( range_policy(0, range.m_num_tiles).set_chunk_size(1), g, str );
}
}} // namespace Kokkos::Experimental
#endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP
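A minimal usage sketch of the new interface (not from the commit; it assumes Rank<2> is accepted as a RangePolicy property by the policy-traits machinery included above, and that a host backend is enabled):

using Kokkos::Experimental::MDRangePolicy;
using Kokkos::Experimental::Rank;
using Kokkos::Experimental::md_parallel_for;

// Rank-2 iteration over [0,128) x [0,128) with 8x8 tiles; the functor
// receives one (i,j) index pair per invocation.
MDRangePolicy< Rank<2> > policy( {0,0} , {128,128} , {8,8} );

md_parallel_for( policy , KOKKOS_LAMBDA( const int i , const int j ) {
  /* ... body ... */
} );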

File diff suppressed because it is too large

View File

@@ -121,13 +121,22 @@
 return *this;
 }
-//! Assignment operator.
+/// \brief Assignment operator, for volatile <tt>*this</tt> and
+/// nonvolatile input.
+///
+/// \param src [in] Input; right-hand side of the assignment.
+///
+/// This operator returns \c void instead of <tt>volatile
+/// complex<RealType>& </tt>. See Kokkos Issue #177 for the
+/// explanation. In practice, this means that you should not chain
+/// assignments with volatile lvalues.
 template<class InputRealType>
 KOKKOS_INLINE_FUNCTION
-volatile complex<RealType>& operator= (const complex<InputRealType>& src) volatile {
+void operator= (const complex<InputRealType>& src) volatile {
   re_ = src.re_;
   im_ = src.im_;
-  return *this;
+  // We deliberately do not return anything here. See explanation
+  // in public documentation above.
 }
 //! Assignment operator.
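The practical consequence of the void return, sketched (hypothetical snippet, not from the commit):

volatile Kokkos::complex<double> a , b ;
Kokkos::complex<double> c( 1.0 , 2.0 );

b = c ;          // fine: volatile assignment, result discarded
// a = (b = c);  // no longer compiles: the volatile operator= returns
                 // void, so the chained assignment has nothing to read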

View File

@@ -41,81 +41,38 @@
 //@HEADER
 */
-#ifndef KOKKOS_BASIC_ALLOCATORS_HPP
-#define KOKKOS_BASIC_ALLOCATORS_HPP
-#if ! KOKKOS_USING_EXP_VIEW
-namespace Kokkos { namespace Impl {
-/// class UnmanagedAllocator
-/// does nothing when deallocate(ptr,size) is called
-class UnmanagedAllocator
-{
-public:
-  static const char * name() { return "Unmanaged Allocator"; }
-  static void deallocate(void * /*ptr*/, size_t /*size*/) {}
-};
-/// class MallocAllocator
-class MallocAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Malloc Allocator";
-  }
-  static void* allocate(size_t size);
-  static void deallocate(void * ptr, size_t size);
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
-/// class AlignedAllocator
-/// memory aligned to Kokkos::Impl::MEMORY_ALIGNMENT
-class AlignedAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Aligned Allocator";
-  }
-  static void* allocate(size_t size);
-  static void deallocate(void * ptr, size_t size);
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
-/// class PageAlignedAllocator
-/// memory aligned to PAGE_SIZE
-class PageAlignedAllocator
-{
-public:
-  static const char * name()
-  {
-    return "Page Aligned Allocator";
-  }
-  static void* allocate(size_t size);
-  static void deallocate(void * ptr, size_t size);
-  static void * reallocate(void * old_ptr, size_t old_size, size_t new_size);
-};
-}} // namespace Kokkos::Impl
-#endif /* #if ! KOKKOS_USING_EXP_VIEW */
-#endif //KOKKOS_BASIC_ALLOCATORS_HPP
+#ifndef KOKKOS_CORE_CONCEPTS_HPP
+#define KOKKOS_CORE_CONCEPTS_HPP
+#include <type_traits>
+namespace Kokkos {
+//Schedules for Execution Policies
+struct Static {};
+struct Dynamic {};
+//Schedule Wrapper Type
+template<class T>
+struct Schedule
+{
+  static_assert( std::is_same<T,Static>::value
+              || std::is_same<T,Dynamic>::value
+               , "Kokkos: Invalid Schedule<> type."
+               );
+  using schedule_type = Schedule<T>;
+  using type = T;
+};
+//Specify Iteration Index Type
+template<typename T>
+struct IndexType
+{
+  static_assert(std::is_integral<T>::value,"Kokkos: Invalid IndexType<>.");
+  using index_type = IndexType<T>;
+  using type = T;
+};
+} // namespace Kokkos
+#endif // KOKKOS_CORE_CONCEPTS_HPP
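A usage sketch of the new concept wrappers (illustrative, not from the commit):

// A dynamically scheduled range policy whose iteration index is a
// 64-bit signed integer; both wrappers are consumed as policy
// properties.
typedef Kokkos::RangePolicy< Kokkos::Schedule<Kokkos::Dynamic>
                           , Kokkos::IndexType<int64_t> > policy_type ;

Kokkos::parallel_for( policy_type( 0 , n ) ,
  KOKKOS_LAMBDA( const int64_t i ) { /* ... */ } );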

View File

@@ -159,8 +159,6 @@ void * kokkos_realloc( void * arg_alloc , const size_t arg_alloc_size )
 } // namespace Kokkos
-#if KOKKOS_USING_EXP_VIEW
 namespace Kokkos {
 using Kokkos::Experimental::kokkos_malloc ;
@@ -169,76 +167,6 @@ using Kokkos::Experimental::kokkos_free ;
 }
#else
namespace Kokkos {
namespace Impl {
// should only by used by kokkos_malloc and kokkos_free
struct MallocHelper
{
static void increment_ref_count( AllocationTracker const & tracker )
{
tracker.increment_ref_count();
}
static void decrement_ref_count( AllocationTracker const & tracker )
{
tracker.decrement_ref_count();
}
};
} // namespace Impl
/* Allocate memory from a memory space.
* The allocation is tracked in Kokkos memory tracking system, so
* leaked memory can be identified.
*/
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const std::string label, size_t count) {
if(count == 0) return NULL;
typedef typename Arg::memory_space MemorySpace;
Impl::AllocationTracker tracker = MemorySpace::allocate_and_track(label,count);;
Impl::MallocHelper::increment_ref_count( tracker );
return tracker.alloc_ptr();
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_malloc(const size_t& count) {
return kokkos_malloc<Arg>("DefaultLabel",count);
}
/* Free memory from a memory space.
*/
template< class Arg = DefaultExecutionSpace>
void kokkos_free(const void* ptr) {
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(ptr);
if (tracker.is_valid()) {
Impl::MallocHelper::decrement_ref_count( tracker );
}
}
template< class Arg = DefaultExecutionSpace>
void* kokkos_realloc(const void* old_ptr, size_t size) {
if(old_ptr == NULL)
return kokkos_malloc<Arg>(size);
typedef typename Arg::memory_space MemorySpace;
typedef typename MemorySpace::allocator allocator;
Impl::AllocationTracker tracker = Impl::AllocationTracker::find<allocator>(old_ptr);
tracker.reallocate(size);
return tracker.alloc_ptr();
}
} // namespace Kokkos
#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
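For reference, a short sketch of the surviving interface (illustrative; assumes the default memory-space overloads of the Experimental functions aliased above):

// Allocate 100 doubles through the labeled, tracked allocator and
// release them; the label aids Kokkos' leak reporting.
void * raw = Kokkos::kokkos_malloc( "my_buffer" , 100 * sizeof(double) );
double * p = static_cast<double*>( raw );
/* ... use p inside kernels ... */
Kokkos::kokkos_free( p );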

View File

@@ -69,6 +69,9 @@ namespace {
 /**\brief Token to indicate that a parameter's value is to be automatically selected */
 constexpr AUTO_t AUTO = Kokkos::AUTO_t();
 }
+
+struct InvalidType {};
+
 }
 //----------------------------------------------------------------------------
@@ -225,7 +228,7 @@ template< class FunctorType , class ExecPolicy , class ExecutionSpace =
 ///
 /// This is an implementation detail of parallel_reduce. Users should
 /// skip this and go directly to the nonmember function parallel_reduce.
-template< class FunctorType , class ExecPolicy , class ExecutionSpace =
+template< class FunctorType , class ExecPolicy , class ReducerType = InvalidType, class ExecutionSpace =
   typename Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
   > class ParallelReduce ;

View File

@@ -56,11 +56,14 @@
 #include <Kokkos_CudaSpace.hpp>
 #include <Kokkos_Parallel.hpp>
+#include <Kokkos_TaskPolicy.hpp>
 #include <Kokkos_Layout.hpp>
 #include <Kokkos_ScratchSpace.hpp>
 #include <Kokkos_MemoryTraits.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
 /*--------------------------------------------------------------------------*/
 namespace Kokkos {
@@ -257,10 +260,10 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Cuda/Kokkos_CudaExec.hpp>
 #include <Cuda/Kokkos_Cuda_View.hpp>
-#include <KokkosExp_View.hpp>
 #include <Cuda/KokkosExp_Cuda_View.hpp>
 #include <Cuda/Kokkos_Cuda_Parallel.hpp>
+#include <Cuda/Kokkos_Cuda_Task.hpp>
 //----------------------------------------------------------------------------

View File

@@ -54,10 +54,7 @@
 #include <Kokkos_HostSpace.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
 #include <Cuda/Kokkos_Cuda_abort.hpp>
-#include <Cuda/Kokkos_Cuda_BasicAllocators.hpp>
 /*--------------------------------------------------------------------------*/
@@ -77,33 +74,6 @@ public:
 /*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaMallocAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/*--------------------------------*/
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
 CudaSpace();
 CudaSpace( CudaSpace && rhs ) = default ;
 CudaSpace( const CudaSpace & rhs ) = default ;
@@ -137,7 +107,7 @@ namespace Impl {
 /// where the hash value is derived from the address of the
 /// object for which an atomic operation is performed.
 /// This function initializes the locks to zero (unset).
-void init_lock_array_cuda_space();
+void init_lock_arrays_cuda_space();
 /// \brief Retrieve the pointer to the lock array for arbitrary size atomics.
 ///
@@ -146,7 +116,23 @@ void init_lock_array_cuda_space();
 /// object for which an atomic operation is performed.
 /// This function retrieves the lock array pointer.
 /// If the array is not yet allocated it will do so.
-int* lock_array_cuda_space_ptr(bool deallocate = false);
+int* atomic_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for team and thread private global memory.
+///
+/// Team and Thread private scratch allocations in
+/// global memory are acquired via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* scratch_lock_array_cuda_space_ptr(bool deallocate = false);
+
+/// \brief Retrieve the pointer to the scratch array for unique identifiers.
+///
+/// Unique identifiers in the range 0-Cuda::concurrency
+/// are provided via locks.
+/// This function retrieves the lock array pointer.
+/// If the array is not yet allocated it will do so.
+int* threadid_lock_array_cuda_space_ptr(bool deallocate = false);
 }
 } // namespace Kokkos
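The lock-array idea behind these declarations, sketched as device code (illustrative only, not the library's implementation; the function and parameter names here are hypothetical):

// Hash the target address into a fixed pool of device-resident locks,
// acquire, apply the guarded update, then release. The while/if shape
// keeps divergent lanes retrying without deadlocking the warp.
__device__ void locked_update( double * addr , double v ,
                               int * locks , int lock_count )
{
  const int idx = (int)( ( (size_t) addr >> 2 ) % lock_count ); // address hash
  bool done = false ;
  while ( ! done ) {
    if ( 0 == atomicCAS( locks + idx , 0 , 1 ) ) {  // try to acquire
      *addr += v ;                                  // guarded read-modify-write
      __threadfence();                              // publish before release
      atomicExch( locks + idx , 0 );                // release
      done = true ;
    }
  }
}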
@@ -172,33 +158,6 @@ public:
 /*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaUVMAllocator allocator;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
/** \brief Cuda specific function to attached texture object to an allocation.
* Output the texture object, base pointer, and offset from the input pointer.
*/
#if defined( __CUDACC__ )
static void texture_object_attach( Impl::AllocationTracker const & tracker
, unsigned type_size
, ::cudaChannelFormatDesc const & desc
);
#endif
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
 CudaUVMSpace();
 CudaUVMSpace( CudaUVMSpace && rhs ) = default ;
 CudaUVMSpace( const CudaUVMSpace & rhs ) = default ;
@@ -242,22 +201,6 @@ public:
 /*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::CudaHostAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/
 CudaHostPinnedSpace();
 CudaHostPinnedSpace( CudaHostPinnedSpace && rhs ) = default ;
 CudaHostPinnedSpace( const CudaHostPinnedSpace & rhs ) = default ;

View File

@@ -47,167 +47,15 @@
 #include <Kokkos_Core_fwd.hpp>
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_StaticAssert.hpp>
+#include <impl/Kokkos_Error.hpp>
 #include <impl/Kokkos_Tags.hpp>
+#include <impl/Kokkos_AnalyzePolicy.hpp>
+#include <Kokkos_Concepts.hpp>
 #include <iostream>
 //----------------------------------------------------------------------------
 namespace Kokkos {
//Schedules for Execution Policies
struct Static {
};
struct Dynamic {
};
//Schedule Wrapper Type
template<class ScheduleType>
struct Schedule {
static_assert(std::is_same<ScheduleType,Static>::value ||
std::is_same<ScheduleType,Dynamic>::value,
"Kokkos: Invalid Schedule<> type.");
typedef Schedule<ScheduleType> schedule_type;
typedef ScheduleType type;
};
//Specify Iteration Index Type
template<typename iType>
struct IndexType {
static_assert(std::is_integral<iType>::value,"Kokkos: Invalid IndexType<>.");
typedef IndexType<iType> index_type;
typedef iType type;
};
namespace Impl {
template<class Arg>
struct is_schedule_type {
enum { value = 0};
};
template<class ScheduleType>
struct is_schedule_type<Schedule<ScheduleType> > {
enum {value = 1 };
};
template<class Arg>
struct is_index_type {
enum { value = 0 };
};
template<typename iType>
struct is_index_type<IndexType<iType> > {
enum { value = 1 };
};
template<typename Arg>
struct is_tag_type {
enum { value = !(is_execution_space<Arg>::value ||
is_schedule_type<Arg>::value ||
is_index_type<Arg>::value ||
std::is_integral<Arg>::value)};
};
//Policy Traits
template<class ... Properties>
struct PolicyTraits;
template<>
struct PolicyTraits<void> {
typedef void execution_space;
typedef void schedule_type;
typedef void index_type;
typedef void tag_type;
};
//Strip off ExecutionSpace
template<class ExecutionSpace, class ... Props>
struct PolicyTraits<typename std::enable_if<is_execution_space<ExecutionSpace>::value >::type,ExecutionSpace,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::execution_space, void>::value,
"ExecutionPolicy: Only one execution space template argument may be used.");
typedef ExecutionSpace execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off ScheduleType
template<class ScheduleType, class ... Props>
struct PolicyTraits<typename std::enable_if<is_schedule_type<Schedule<ScheduleType> >::value >::type,Schedule<ScheduleType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::schedule_type, void>::value,
"ExecutionPolicy: Only one Schedule<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef ScheduleType schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off IndexType
template<typename iType, class ... Props>
struct PolicyTraits<void, IndexType<iType>,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off raw IndexType
template<typename iType, class ... Props>
struct PolicyTraits<typename std::enable_if<std::is_integral<iType>::value>::type, iType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::index_type, void>::value,
"ExecutionPolicy: Only one IndexType<..> template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef iType index_type;
typedef typename PolicyTraits<void, Props ...>::tag_type tag_type;
};
//Strip off TagType
template<class TagType, class ... Props>
struct PolicyTraits<typename std::enable_if<!is_schedule_type<TagType>::value &&
!is_execution_space<TagType>::value &&
!is_index_type<TagType>::value &&
!std::is_integral<TagType>::value
>::type,
TagType,Props ...> {
static_assert( std::is_same<typename PolicyTraits<void, Props ...>::tag_type, void>::value,
"ExecutionPolicy: Only one tag type template argument may be used.");
typedef typename PolicyTraits<void, Props ...>::execution_space execution_space;
typedef typename PolicyTraits<void, Props ...>::schedule_type schedule_type;
typedef typename PolicyTraits<void, Props ...>::index_type index_type;
typedef TagType tag_type;
};
template<class ... Props>
struct PolicyTraits {
#ifdef KOKKOS_DIRECT_VARIADIC_EXPANSION
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::execution_space>::value,
Kokkos::DefaultExecutionSpace, typename PolicyTraits<void,Props ...>::execution_space>::type execution_space;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::schedule_type>::value,
Kokkos::Static, typename PolicyTraits<void,Props ...>::schedule_type>::type schedule_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::index_type>::value,
typename execution_space::size_type, typename PolicyTraits<void,Props ...>::index_type>::type index_type;
typedef typename std::conditional<std::is_same<void, typename PolicyTraits<void, Props ...>::tag_type>::value,
void, typename PolicyTraits<void,Props ...>::tag_type>::type work_tag;
#else
typedef typename has_condition<Kokkos::DefaultExecutionSpace,is_execution_space,Props ...>::type execution_space;
typedef typename has_condition<Kokkos::Schedule<Kokkos::Static>,is_schedule_type,Props ...>::type schedule_type;
typedef typename has_condition<void,is_tag_type,Props ...>::type work_tag;
typedef typename has_condition<typename execution_space::size_type, std::is_integral, Props ... >::type default_index_type;
typedef typename has_condition<Kokkos::IndexType<default_index_type>,is_index_type,Props ...>::type::type index_type;
#endif
};
}
}
namespace Kokkos {
/** \brief Execution policy for work over a range of an integral type. /** \brief Execution policy for work over a range of an integral type.
 *
 * Valid template argument options:
@@ -230,7 +78,9 @@ namespace Kokkos {
 * Blocking is the granularity of partitioning the range among threads.
 */
 template<class ... Properties>
-class RangePolicy: public Impl::PolicyTraits<Properties ... > {
+class RangePolicy
+  : public Impl::PolicyTraits<Properties ... >
+{
 private:
 typedef Impl::PolicyTraits<Properties ... > traits;
@@ -243,6 +93,7 @@ private:
 public:
 //! Tag this class as an execution policy
+typedef RangePolicy execution_policy;
 typedef typename traits::index_type member_type ;
 KOKKOS_INLINE_FUNCTION const typename traits::execution_space & space() const { return m_space ; }
@@ -370,6 +221,7 @@ public:
 };
 };
 } // namespace Kokkos
 //----------------------------------------------------------------------------
@@ -377,38 +229,6 @@
 namespace Kokkos {
namespace Experimental {
/** \brief Scratch memory request accepting per team and per thread value
*
* An instance of this class can be given as the last argument to a
* TeamPolicy constructor. It sets the amount of user requested shared
* memory for the team.
*/
template< class MemorySpace >
class TeamScratchRequest {
size_t m_per_team;
size_t m_per_thread;
public:
TeamScratchRequest(size_t per_team_, size_t per_thread_ = 0):
m_per_team(per_team_), m_per_thread(per_thread_) {
}
size_t per_team() const {
return m_per_team;
}
size_t per_thread() const {
return m_per_thread;
}
size_t total(const size_t team_size) const {
return m_per_team + m_per_thread * team_size;
}
};
}
 namespace Impl {
@@ -451,11 +271,9 @@ public:
 TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 );
-template<class MemorySpace>
-TeamPolicyInternal( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
-template<class MemorySpace>
-TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request );
+/* TeamPolicyInternal( int league_size_request , int team_size_request );
+   TeamPolicyInternal( int league_size_request , const Kokkos::AUTO_t & );*/
 /** \brief The actual league size (number of teams) of the policy.
 *
@@ -574,9 +392,11 @@ class TeamPolicy: public
 typedef Impl::TeamPolicyInternal<
   typename Impl::PolicyTraits<Properties ... >::execution_space,
   Properties ...> internal_policy;
 typedef Impl::PolicyTraits<Properties ... > traits;
 public:
+typedef TeamPolicy execution_policy;
 TeamPolicy& operator = (const TeamPolicy&) = default;
@@ -594,13 +414,11 @@ public:
 TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , int vector_length_request = 1 )
   : internal_policy(league_size_request,Kokkos::AUTO(), vector_length_request) {}
-template<class MemorySpace>
-TeamPolicy( int league_size_request , int team_size_request , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
-  : internal_policy(league_size_request,team_size_request, team_scratch_memory_request) {}
-template<class MemorySpace>
-TeamPolicy( int league_size_request , const Kokkos::AUTO_t & , const Experimental::TeamScratchRequest<MemorySpace>& team_scratch_memory_request )
-  : internal_policy(league_size_request,Kokkos::AUTO(), team_scratch_memory_request) {}
+/* TeamPolicy( int league_size_request , int team_size_request )
+   : internal_policy(league_size_request,team_size_request) {}
+ TeamPolicy( int league_size_request , const Kokkos::AUTO_t & )
+   : internal_policy(league_size_request,Kokkos::AUTO()) {}*/
 private:
 TeamPolicy(const internal_policy& p):internal_policy(p) {}
@ -744,6 +562,7 @@ Impl::ThreadVectorRangeBoundariesStruct<iType,TeamMemberType> ThreadVectorRange(
} // namespace Kokkos } // namespace Kokkos
#endif /* #define KOKKOS_EXECPOLICY_HPP */ #endif /* #define KOKKOS_EXECPOLICY_HPP */
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
View File
@ -120,21 +120,6 @@ public:
//! This memory space preferred device_type //! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type; typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
typedef Impl::HBWMallocAllocator allocator ;
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Kokkos::Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
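With the non-experimental View path gone, labeled, reference-counted allocation is obtained by constructing a View rather than calling the removed allocate_and_track. A sketch (not specific to HBWSpace):

#include <Kokkos_Core.hpp>

void tracked_allocation(const size_t n) {
  // The label is associated with the block, and the allocation is
  // reference counted; it is freed when the last View referencing it dies.
  Kokkos::View<double*> data("my_label", n);
}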
/*--------------------------------*/ /*--------------------------------*/
/* Functions unique to the HBWSpace */ /* Functions unique to the HBWSpace */
static int in_parallel(); static int in_parallel();
View File
@ -55,9 +55,6 @@
#include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/KokkosExp_SharedAlloc.hpp> #include <impl/KokkosExp_SharedAlloc.hpp>
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
@ -128,25 +125,6 @@ public:
//! This memory space preferred device_type //! This memory space preferred device_type
typedef Kokkos::Device<execution_space,memory_space> device_type; typedef Kokkos::Device<execution_space,memory_space> device_type;
/*--------------------------------*/
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_USE_PAGE_ALIGNED_HOST_MEMORY )
typedef Impl::PageAlignedAllocator allocator ;
#else
typedef Impl::AlignedAllocator allocator ;
#endif
/** \brief Allocate a contiguous block of memory.
*
* The input label is associated with the block of memory.
* The block of memory is tracked via reference counting where
* allocation gives it a reference count of one.
*/
static Impl::AllocationTracker allocate_and_track( const std::string & label, const size_t size );
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
/*--------------------------------*/ /*--------------------------------*/
/* Functions unique to the HostSpace */ /* Functions unique to the HostSpace */
static int in_parallel(); static int in_parallel();
View File
@ -133,11 +133,23 @@
// still identifies as 7.0
#error "Cuda version 7.5 or greater required for host-to-device Lambda support"
#endif

#if ( CUDA_VERSION < 8000 )
#define KOKKOS_LAMBDA [=]__device__
#else
#define KOKKOS_LAMBDA [=]__host__ __device__
#endif

#define KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA 1
#endif

#endif /* #if defined( KOKKOS_HAVE_CUDA ) && defined( __CUDACC__ ) */
#if defined(KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA)
// Cuda version 8.0 still needs the functor wrapper
#if (KOKKOS_HAVE_CXX11_DISPATCH_LAMBDA /* && (CUDA_VERSION < 8000) */ )
#define KOKKOS_IMPL_NEED_FUNCTOR_WRAPPER
#endif
#endif
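With CUDA 8 the macro becomes a __host__ __device__ lambda, so the same KOKKOS_LAMBDA source works for host-dispatched kernels on all backends; on non-CUDA builds it is a plain [=] capture. A minimal sketch:

#include <Kokkos_Core.hpp>

void scale(Kokkos::View<double*> x, const double a) {
  // Expands to [=]__device__ (CUDA < 8.0), [=]__host__ __device__ (CUDA >= 8.0),
  // or [=] on host-only backends.
  Kokkos::parallel_for(x.dimension_0(), KOKKOS_LAMBDA(const int i) {
    x(i) *= a;
  });
}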
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
/* Language info: C++, CUDA, OPENMP */ /* Language info: C++, CUDA, OPENMP */
@ -440,27 +452,16 @@
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
-/* Transitional macro to change between old and new View,
- * default to use new View.
- */
+/* Transitional macros to change between old and new View
+ * are no longer supported.
+ */

-#if ! defined( KOKKOS_USING_EXP_VIEW )
#if defined( KOKKOS_USING_DEPRECATED_VIEW )
-#define KOKKOS_USING_EXP_VIEW 0
-#else
-#define KOKKOS_USING_EXP_VIEW 1
-#endif
-#endif
+#error "Kokkos deprecated View has been removed"
#endif

-#if KOKKOS_USING_EXP_VIEW
-#if ! defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
-#define KOKKOS_USING_EXPERIMENTAL_VIEW
-#endif
-#else /* ! KOKKOS_USING_EXP_VIEW */
-#if defined( KOKKOS_USING_EXPERIMENTAL_VIEW )
-#error "KOKKOS_USING_EXP_VIEW and KOKKOS_USING_EXPERIMENAL_VIEW are both defined and are incompatible"
-#endif
-#endif
+#define KOKKOS_USING_EXP_VIEW 1
+#define KOKKOS_USING_EXPERIMENTAL_VIEW
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
File diff suppressed because it is too large.
View File
@ -58,9 +58,11 @@
#endif #endif
#include <Kokkos_ScratchSpace.hpp> #include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_Parallel.hpp> #include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp> #include <Kokkos_Layout.hpp>
#include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_Tags.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
namespace Kokkos { namespace Kokkos {
@ -177,6 +179,7 @@ struct VerifyExecutionCanAccessMemorySpace
#include <OpenMP/Kokkos_OpenMPexec.hpp> #include <OpenMP/Kokkos_OpenMPexec.hpp>
#include <OpenMP/Kokkos_OpenMP_Parallel.hpp> #include <OpenMP/Kokkos_OpenMP_Parallel.hpp>
#include <OpenMP/Kokkos_OpenMP_Task.hpp>
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
View File
@ -125,17 +125,26 @@ struct pair
    return *this;
  }

-  /// \brief Assignment operator.
+  /// \brief Assignment operator, for volatile <tt>*this</tt>.
  ///
-  /// This calls the assignment operators of T1 and T2. It won't
-  /// compile if the assignment operators are not defined and public.
+  /// \param p [in] Input; right-hand side of the assignment.
+  ///
+  /// This calls the assignment operators of T1 and T2. It will not
+  /// compile if the assignment operators are not defined and public.
+  ///
+  /// This operator returns \c void instead of <tt>volatile pair<T1,
+  /// T2>& </tt>. See Kokkos Issue #177 for the explanation. In
+  /// practice, this means that you should not chain assignments with
+  /// volatile lvalues.
  template <class U, class V>
  KOKKOS_FORCEINLINE_FUNCTION
-  volatile pair<T1, T2> & operator=(const volatile pair<U,V> &p) volatile
+  void operator=(const volatile pair<U,V> &p) volatile
  {
    first = p.first;
    second = p.second;
-    return *this;
+    // We deliberately do not return anything here. See explanation
+    // in public documentation above.
  }
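The practical consequence of the void return is that assignment through a volatile pair can no longer be chained; a sketch:

#include <Kokkos_Pair.hpp>

void volatile_pair_assignment() {
  Kokkos::pair<int, int> a(1, 2);
  volatile Kokkos::pair<int, int> b;
  b = a;            // fine: the volatile operator= runs, returning void
  // Kokkos::pair<int, int> c;
  // c = (b = a);   // no longer compiles: (b = a) has type void (Issue #177)
}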
// from std::pair<U,V> // from std::pair<U,V>
View File
@ -57,7 +57,6 @@
#include <typeinfo> #include <typeinfo>
#endif #endif
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_Traits.hpp> #include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
@ -178,8 +177,8 @@ void parallel_for( const ExecPolicy & policy
{ {
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -190,8 +189,8 @@ void parallel_for( const ExecPolicy & policy
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID); Kokkos::Profiling::endParallelFor(kpID);
} }
#endif #endif
} }
@ -210,8 +209,8 @@ void parallel_for( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelFor("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -222,8 +221,8 @@ void parallel_for( const size_t work_count
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelFor(kpID); Kokkos::Profiling::endParallelFor(kpID);
} }
#endif #endif
} }
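Because the hooks forward the string argument, giving kernels explicit labels is what makes profiler output readable; with an empty label the mangled typeid name of the functor is reported instead. A sketch using the trailing-string overload defined above:

#include <Kokkos_Core.hpp>

void zero_fill(Kokkos::View<double*> x) {
  // "zero_fill" is what Kokkos::Profiling::beginParallelFor receives
  // when a profiling library is loaded.
  Kokkos::parallel_for(x.dimension_0(), KOKKOS_LAMBDA(const int i) {
    x(i) = 0.0;
  }, "zero_fill");
}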
@ -248,405 +247,9 @@ void parallel_for( const std::string & str
(void) str; (void) str;
} }
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
/** \brief Parallel reduction
*
* Example of a parallel_reduce functor for a POD (plain old data) value type:
* \code
* class FunctorType { // For POD value type
* public:
* typedef ... execution_space ;
* typedef <podType> value_type ;
* void operator()( <intType> iwork , <podType> & update ) const ;
* void init( <podType> & update ) const ;
* void join( volatile <podType> & update ,
* volatile const <podType> & input ) const ;
*
* typedef true_type has_final ;
* void final( <podType> & update ) const ;
* };
* \endcode
*
* Example of a parallel_reduce functor for an array of POD (plain old data) values:
* \code
* class FunctorType { // For array of POD value
* public:
* typedef ... execution_space ;
* typedef <podType> value_type[] ;
* void operator()( <intType> , <podType> update[] ) const ;
* void init( <podType> update[] ) const ;
* void join( volatile <podType> update[] ,
* volatile const <podType> input[] ) const ;
*
* typedef true_type has_final ;
* void final( <podType> update[] ) const ;
* };
* \endcode
*/
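A concrete functor matching the POD interface documented above (a plain sum, with the optional final() omitted), as a sketch assuming the default execution space:

#include <Kokkos_Core.hpp>

struct SumFunctor {
  typedef Kokkos::DefaultExecutionSpace execution_space;
  typedef double value_type;

  Kokkos::View<const double*> x;

  KOKKOS_INLINE_FUNCTION
  void operator()(const int i, value_type& update) const { update += x(i); }

  KOKKOS_INLINE_FUNCTION
  void init(value_type& update) const { update = 0.0; }

  KOKKOS_INLINE_FUNCTION
  void join(volatile value_type& update, volatile const value_type& input) const {
    update += input;
  }
};
// Usage: SumFunctor f; f.x = x; double r = 0;
//        Kokkos::parallel_reduce(x.dimension_0(), f, r);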
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const std::string& str = ""
, typename Impl::enable_if< ! Impl::is_integral< ExecPolicy >::value >::type * = 0
)
{
// typedef typename
// Impl::FunctorPolicyExecutionSpace< FunctorType , ExecPolicy >::execution_space
// execution_space ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
} }
// integral range policy #include <Kokkos_Parallel_Reduce.hpp>
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const std::string& str = ""
)
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > policy ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and view output
template< class ExecPolicy , class FunctorType , class ViewType >
inline
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<
( Kokkos::is_view<ViewType>::value && ! Impl::is_integral< ExecPolicy >::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// general policy and pod or array of pod output
template< class ExecPolicy , class FunctorType >
void parallel_reduce( const ExecPolicy & policy
, const FunctorType & functor
#ifdef KOKKOS_HAVE_CUDA
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value &&
! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value )
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type>::type result_ref
, const std::string& str = ""
, typename Impl::enable_if<! Impl::is_same<typename ExecPolicy::execution_space,Kokkos::Cuda>::value >::type* = 0
)
#else
, typename Impl::enable_if<
( ! Impl::is_integral< ExecPolicy >::value)
, typename Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag >::reference_type
>::type result_ref
, const std::string& str = ""
)
#endif
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , typename ExecPolicy::work_tag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , typename ExecPolicy::work_tag > ValueOps ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result_ref )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , policy , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
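From the caller's side, the unmanaged-View wrapping above is invisible: a host scalar passed by reference receives the reduced value after execute(). Sketch (host backends; the Cuda specialization is excluded by the enable_if above):

#include <Kokkos_Core.hpp>

double reduce_sum(Kokkos::View<const double*> x) {
  double sum = 0.0;
  // 'sum' is wrapped internally in View<double, HostSpace, MemoryUnmanaged>.
  Kokkos::parallel_reduce(x.dimension_0(), KOKKOS_LAMBDA(const int i, double& update) {
    update += x(i);
  }, sum);
  return sum;
}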
// integral range policy and view output
template< class FunctorType , class ViewType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, const ViewType & result_view
, const std::string& str = ""
, typename Impl::enable_if<( Kokkos::is_view<ViewType>::value
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
)>::type * = 0 )
{
typedef typename
Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef RangePolicy< execution_space > ExecPolicy ;
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType, ExecPolicy > closure( functor , ExecPolicy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
// integral range policy and pod or array of pod output
template< class FunctorType >
inline
void parallel_reduce( const size_t work_count
, const FunctorType & functor
, typename Kokkos::Impl::FunctorValueTraits<
typename Impl::if_c<Impl::is_execution_policy<FunctorType>::value ||
Impl::is_integral<FunctorType>::value,
void,FunctorType>::type
, void >::reference_type result
, const std::string& str = ""
, typename Impl::enable_if< true
#ifdef KOKKOS_HAVE_CUDA
&& ! Impl::is_same<
typename Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space,
Kokkos::Cuda>::value
#endif
>::type * = 0 )
{
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , void > ValueTraits ;
typedef Kokkos::Impl::FunctorValueOps< FunctorType , void > ValueOps ;
typedef typename
Kokkos::Impl::FunctorPolicyExecutionSpace< FunctorType , void >::execution_space
execution_space ;
typedef Kokkos::RangePolicy< execution_space > policy ;
// Wrap the result output request in a view to inform the implementation
// of the type and memory space.
typedef typename Kokkos::Impl::if_c< (ValueTraits::StaticValueSize != 0)
, typename ValueTraits::value_type
, typename ValueTraits::pointer_type
>::type value_type ;
Kokkos::View< value_type
, HostSpace
, Kokkos::MemoryUnmanaged
>
result_view( ValueOps::pointer( result )
, ValueTraits::value_count( functor )
);
#if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelReduce("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
}
#endif
Kokkos::Impl::shared_allocation_tracking_claim_and_disable();
Impl::ParallelReduce< FunctorType , policy > closure( functor , policy(0,work_count) , result_view );
Kokkos::Impl::shared_allocation_tracking_release_and_enable();
closure.execute();
#if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelReduce(kpID);
}
#endif
}
#ifndef KOKKOS_HAVE_CUDA
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType * result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType , class ResultType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor
, ResultType & result)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,result,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
template< class ExecPolicy , class FunctorType >
inline
void parallel_reduce( const std::string & str
, const ExecPolicy & policy
, const FunctorType & functor)
{
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG Start parallel_reduce kernel: " << str << std::endl;
#endif
parallel_reduce(policy,functor,str);
#if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES
Kokkos::fence();
std::cout << "KOKKOS_DEBUG End parallel_reduce kernel: " << str << std::endl;
#endif
(void) str;
}
#endif
} // namespace Kokkos
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -816,8 +419,8 @@ void parallel_scan( const ExecutionPolicy & policy
{ {
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -828,8 +431,8 @@ void parallel_scan( const ExecutionPolicy & policy
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID); Kokkos::Profiling::endParallelScan(kpID);
} }
#endif #endif
@ -849,8 +452,8 @@ void parallel_scan( const size_t work_count
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
uint64_t kpID = 0; uint64_t kpID = 0;
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID); Kokkos::Profiling::beginParallelScan("" == str ? typeid(FunctorType).name() : str, 0, &kpID);
} }
#endif #endif
@ -861,8 +464,8 @@ void parallel_scan( const size_t work_count
closure.execute(); closure.execute();
#if (KOKKOS_ENABLE_PROFILING) #if (KOKKOS_ENABLE_PROFILING)
if(Kokkos::Experimental::profileLibraryLoaded()) { if(Kokkos::Profiling::profileLibraryLoaded()) {
Kokkos::Experimental::endParallelScan(kpID); Kokkos::Profiling::endParallelScan(kpID);
} }
#endif #endif
File diff suppressed because it is too large.
View File
@ -66,11 +66,15 @@ public:
private: private:
-  mutable char * m_iter ;
-  char * m_end ;
+  mutable char * m_iter_L0 ;
+  char * m_end_L0 ;
+  mutable char * m_iter_L1 ;
+  char * m_end_L1 ;
  mutable int m_multiplier;
  mutable int m_offset;
+  mutable int m_default_level;
ScratchMemorySpace(); ScratchMemorySpace();
ScratchMemorySpace & operator = ( const ScratchMemorySpace & ); ScratchMemorySpace & operator = ( const ScratchMemorySpace & );
@ -95,34 +99,58 @@ public:
  template< typename IntType >
  KOKKOS_INLINE_FUNCTION
-  void* get_shmem (const IntType& size) const {
-    void* tmp = m_iter + m_offset * align (size);
-    if (m_end < (m_iter += align (size) * m_multiplier)) {
-      m_iter -= align (size) * m_multiplier; // put it back like it was
+  void* get_shmem (const IntType& size, int level = -1) const {
+    if(level == -1)
+      level = m_default_level;
+    if(level == 0) {
+      void* tmp = m_iter_L0 + m_offset * align (size);
+      if (m_end_L0 < (m_iter_L0 += align (size) * m_multiplier)) {
+        m_iter_L0 -= align (size) * m_multiplier; // put it back like it was
#ifdef KOKKOS_HAVE_DEBUG
      // mfh 23 Jun 2015: printf call consumes 25 registers
      // in a CUDA build, so only print in debug mode. The
      // function still returns NULL if not enough memory.
      printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
              "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
-             long(m_end-m_iter));
+             long(m_end_L0-m_iter_L0));
#endif // KOKKOS_HAVE_DEBUG
      tmp = 0;
    }
    return tmp;
+    } else {
+      void* tmp = m_iter_L1 + m_offset * align (size);
+      if (m_end_L1 < (m_iter_L1 += align (size) * m_multiplier)) {
+        m_iter_L1 -= align (size) * m_multiplier; // put it back like it was
+#ifdef KOKKOS_HAVE_DEBUG
+        // mfh 23 Jun 2015: printf call consumes 25 registers
+        // in a CUDA build, so only print in debug mode. The
+        // function still returns NULL if not enough memory.
+        printf ("ScratchMemorySpace<...>::get_shmem: Failed to allocate "
+                "%ld byte(s); remaining capacity is %ld byte(s)\n", long(size),
+                long(m_end_L1-m_iter_L1));
+#endif // KOKKOS_HAVE_DEBUG
+        tmp = 0;
+      }
+      return tmp;
+    }
  }
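Callers pick the scratch level explicitly, or inherit the default level; since get_shmem still returns NULL on exhaustion, checking the result remains the caller's job. A sketch against this interface, where 'member' is a team handle whose team_shmem() returns this ScratchMemorySpace:

template <class Member>
KOKKOS_INLINE_FUNCTION
void two_level_scratch(const Member& member) {
  double* fast = (double*) member.team_shmem().get_shmem(64 * sizeof(double), 0);   // level 0
  double* big  = (double*) member.team_shmem().get_shmem(4096 * sizeof(double), 1); // level 1
  if (fast == 0 || big == 0) return; // request exceeded a per-level capacity
}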
  template< typename IntType >
  KOKKOS_INLINE_FUNCTION
-  ScratchMemorySpace( void * ptr , const IntType & size )
-    : m_iter( (char *) ptr )
-    , m_end( m_iter + size )
+  ScratchMemorySpace( void * ptr_L0 , const IntType & size_L0 , void * ptr_L1 = NULL , const IntType & size_L1 = 0 )
+    : m_iter_L0( (char *) ptr_L0 )
+    , m_end_L0( m_iter_L0 + size_L0 )
+    , m_iter_L1( (char *) ptr_L1 )
+    , m_end_L1( m_iter_L1 + size_L1 )
    , m_multiplier( 1 )
    , m_offset( 0 )
+    , m_default_level( 0 )
    {}

  KOKKOS_INLINE_FUNCTION
-  const ScratchMemorySpace& set_team_thread_mode(const int& multiplier, const int& offset) const {
+  const ScratchMemorySpace& set_team_thread_mode(const int& level, const int& multiplier, const int& offset) const {
+    m_default_level = level;
    m_multiplier = multiplier;
    m_offset = offset;
    return *this;
View File
@ -50,12 +50,17 @@
#include <cstddef> #include <cstddef>
#include <iosfwd> #include <iosfwd>
#include <Kokkos_Parallel.hpp> #include <Kokkos_Parallel.hpp>
#include <Kokkos_TaskPolicy.hpp>
#include <Kokkos_Layout.hpp> #include <Kokkos_Layout.hpp>
#include <Kokkos_HostSpace.hpp> #include <Kokkos_HostSpace.hpp>
#include <Kokkos_ScratchSpace.hpp> #include <Kokkos_ScratchSpace.hpp>
#include <Kokkos_MemoryTraits.hpp> #include <Kokkos_MemoryTraits.hpp>
#include <impl/Kokkos_Tags.hpp> #include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
#include <KokkosExp_MDRangePolicy.hpp>
#if defined( KOKKOS_HAVE_SERIAL ) #if defined( KOKKOS_HAVE_SERIAL )
@ -142,7 +147,9 @@ public:
// Init the array of locks used for arbitrarily sized atomics // Init the array of locks used for arbitrarily sized atomics
Impl::init_lock_array_host_space(); Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
} }
static int is_initialized() { return 1 ; } static int is_initialized() { return 1 ; }
@ -151,7 +158,11 @@ public:
static int concurrency() {return 1;}; static int concurrency() {return 1;};
//! Free any resources being consumed by the device. //! Free any resources being consumed by the device.
static void finalize() {} static void finalize() {
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
}
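The profiling interface is now brought up and torn down with the execution space, so the usual initialize/finalize pairing is what arms and disarms the hooks (when KOKKOS_ENABLE_PROFILING is set). Sketch:

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);   // backends call Kokkos::Profiling::initialize()
  {
    // kernels dispatched here are visible to a loaded profiling tool
  }
  Kokkos::finalize();               // backends call Kokkos::Profiling::finalize()
  return 0;
}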
//! Print configuration information to the given output stream. //! Print configuration information to the given output stream.
static void print_configuration( std::ostream & , const bool /* detail */ = false ) {} static void print_configuration( std::ostream & , const bool /* detail */ = false ) {}
@ -307,8 +318,8 @@ class TeamPolicyInternal< Kokkos::Serial , Properties ... >:public PolicyTraits<
{ {
private: private:
-  size_t m_team_scratch_size ;
-  size_t m_thread_scratch_size ;
+  size_t m_team_scratch_size[2] ;
+  size_t m_thread_scratch_size[2] ;
int m_league_size ; int m_league_size ;
int m_chunk_size; int m_chunk_size;
@ -324,8 +335,10 @@ public:
  TeamPolicyInternal& operator = (const TeamPolicyInternal& p) {
    m_league_size = p.m_league_size;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
    m_chunk_size = p.m_chunk_size;
    return *this;
  }
@ -348,15 +361,15 @@ public:
inline int team_size() const { return 1 ; } inline int team_size() const { return 1 ; }
inline int league_size() const { return m_league_size ; } inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_thread_scratch_size; }
+  inline size_t scratch_size(const int& level, int = 0) const { return m_team_scratch_size[level] + m_thread_scratch_size[level]; }
/** \brief Specify league size, request team size */ /** \brief Specify league size, request team size */
TeamPolicyInternal( execution_space & TeamPolicyInternal( execution_space &
, int league_size_request , int league_size_request
, int /* team_size_request */ , int /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -365,8 +378,8 @@ public:
, int league_size_request , int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */ , const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -374,8 +387,8 @@ public:
TeamPolicyInternal( int league_size_request TeamPolicyInternal( int league_size_request
, int /* team_size_request */ , int /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -383,8 +396,8 @@ public:
TeamPolicyInternal( int league_size_request TeamPolicyInternal( int league_size_request
, const Kokkos::AUTO_t & /* team_size_request */ , const Kokkos::AUTO_t & /* team_size_request */
, int /* vector_length_request */ = 1 ) , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
, m_league_size( league_size_request ) , m_league_size( league_size_request )
, m_chunk_size ( 32 ) , m_chunk_size ( 32 )
{} {}
@ -401,26 +414,23 @@ public:
  /** \brief set per team scratch size for a specific level of the scratch hierarchy */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
    TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
    return p;
  };

  /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
    TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
    return p;
  };

  /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
  inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
    TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_team_scratch_size[level] = per_team.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
    return p;
  };
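With the sizes stored per level, requests for level 0 and level 1 compose; each call returns a modified copy of the policy, and scratch_size(level) reports team plus thread bytes for that level. Sketch on the Serial backend:

#include <Kokkos_Core.hpp>

struct TeamHello {
  typedef Kokkos::TeamPolicy<Kokkos::Serial>::member_type member_type;
  KOKKOS_INLINE_FUNCTION void operator()(const member_type&) const {}
};

void run(const int league_size) {
  Kokkos::TeamPolicy<Kokkos::Serial> policy(league_size, 1);
  Kokkos::parallel_for(
    policy.set_scratch_size(0, Kokkos::PerTeam(512))
          .set_scratch_size(1, Kokkos::PerThread(1024)),
    TeamHello());
}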
@ -489,9 +499,10 @@ public:
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Traits > template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... > , Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Serial , Kokkos::Serial
> >
{ {
@ -499,14 +510,19 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
@ -515,15 +531,15 @@ private:
typename std::enable_if< std::is_same< TagType , void >::value >::type typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const exec( pointer_type ptr ) const
{ {
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end(); const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( i , update ); m_functor( i , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
template< class TagType > template< class TagType >
@ -532,15 +548,15 @@ private:
exec( pointer_type ptr ) const exec( pointer_type ptr ) const
{ {
const TagType t{} ; const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
const typename Policy::member_type e = m_policy.end(); const typename Policy::member_type e = m_policy.end();
for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) { for ( typename Policy::member_type i = m_policy.begin() ; i < e ; ++i ) {
m_functor( t , i , update ); m_functor( t , i , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
public: public:
@ -549,25 +565,43 @@ public:
void execute() const void execute() const
{ {
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , 0 ); ( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr ); this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
} }
template< class ViewType > template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor ParallelReduce( const FunctorType & arg_functor ,
, const Policy & arg_policy const Policy & arg_policy ,
, const ViewType & arg_result ) const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_result_ptr( arg_result.ptr_on_device() ) , m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() )
{ {
static_assert( Kokkos::is_view< ViewType >::value static_assert( Kokkos::is_view< HostViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" ); , "Kokkos::Serial reduce result must be a View" );
static_assert( std::is_same< typename ViewType::memory_space static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Serial reduce result must be a View in HostSpace" );
}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value , Kokkos::HostSpace >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" ); , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
} }
}; };
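The new ReducerType template parameter lets parallel_reduce dispatch on a reducer object that supplies value_type, init/join, and result_view() in place of the functor; ReducerConditional above selects between them. A sketch, assuming a Max reducer living in Kokkos::Experimental at this revision (name and namespace not verified here):

#include <Kokkos_Core.hpp>

double max_entry(Kokkos::View<const double*> x) {
  double result = 0.0;
  Kokkos::parallel_reduce(
    Kokkos::RangePolicy<Kokkos::Serial>(0, x.dimension_0()),
    KOKKOS_LAMBDA(const int i, double& update) {
      if (x(i) > update) update = x(i);
    },
    Kokkos::Experimental::Max<double>(result)); // reducer carries the result view
  return result;
}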
@ -697,15 +731,16 @@ public:
, const Policy & arg_policy ) , const Policy & arg_policy )
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_league( arg_policy.league_size() ) , m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) ) , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , 1 ) )
{ } { }
}; };
/*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/
template< class FunctorType , class ... Properties > template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... > , Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Serial , Kokkos::Serial
> >
{ {
@ -714,30 +749,35 @@ private:
typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ; typedef TeamPolicyInternal< Kokkos::Serial, Properties ... > Policy ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType , WorkTag > ValueInit ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const int m_league ; const int m_league ;
const int m_shared ; const ReducerType m_reducer ;
pointer_type m_result_ptr ; pointer_type m_result_ptr ;
const int m_shared ;
template< class TagType > template< class TagType >
inline inline
typename std::enable_if< std::is_same< TagType , void >::value >::type typename std::enable_if< std::is_same< TagType , void >::value >::type
exec( pointer_type ptr ) const exec( pointer_type ptr ) const
{ {
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) { for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( Member(ileague,m_league,m_shared) , update ); m_functor( Member(ileague,m_league,m_shared) , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
template< class TagType > template< class TagType >
@ -747,14 +787,14 @@ private:
{ {
const TagType t{} ; const TagType t{} ;
reference_type update = ValueInit::init( m_functor , ptr ); reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , ptr );
for ( int ileague = 0 ; ileague < m_league ; ++ileague ) { for ( int ileague = 0 ; ileague < m_league ; ++ileague ) {
m_functor( t , Member(ileague,m_league,m_shared) , update ); m_functor( t , Member(ileague,m_league,m_shared) , update );
} }
Kokkos::Impl::FunctorFinal< FunctorType , TagType >:: Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagType >::
final( m_functor , ptr ); final( ReducerConditional::select(m_functor , m_reducer) , ptr );
} }
public: public:
@ -763,7 +803,7 @@ public:
void execute() const void execute() const
{ {
pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize pointer_type ptr = (pointer_type) Kokkos::Serial::scratch_memory_resize
( ValueTraits::value_size( m_functor ) , m_shared ); ( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , m_shared );
this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr ); this-> template exec< WorkTag >( m_result_ptr ? m_result_ptr : ptr );
} }
@ -771,12 +811,16 @@ public:
template< class ViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy , const Policy & arg_policy
, const ViewType & arg_result , const ViewType & arg_result ,
) typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_league( arg_policy.league_size() ) , m_league( arg_policy.league_size() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) ) , m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() ) , m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( m_functor , 1 ) )
{ {
static_assert( Kokkos::is_view< ViewType >::value static_assert( Kokkos::is_view< ViewType >::value
, "Reduction result on Kokkos::Serial must be a Kokkos::View" ); , "Reduction result on Kokkos::Serial must be a Kokkos::View" );
@ -786,6 +830,21 @@ public:
, "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" ); , "Reduction result on Kokkos::Serial must be a Kokkos::View in HostSpace" );
} }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_league( arg_policy.league_size() )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
}; };
} // namespace Impl } // namespace Impl
@ -1045,6 +1104,10 @@ void single(const Impl::ThreadSingleStruct<Impl::SerialTeamMember>& , const Func
} }
} }
//----------------------------------------------------------------------------
#include <impl/Kokkos_Serial_Task.hpp>
#endif // defined( KOKKOS_HAVE_SERIAL ) #endif // defined( KOKKOS_HAVE_SERIAL )
#endif /* #define KOKKOS_SERIAL_HPP */ #endif /* #define KOKKOS_SERIAL_HPP */
View File
@ -1,4 +1,3 @@
/* /*
//@HEADER //@HEADER
// ************************************************************************ // ************************************************************************
@ -47,13 +46,655 @@
#ifndef KOKKOS_TASKPOLICY_HPP #ifndef KOKKOS_TASKPOLICY_HPP
#define KOKKOS_TASKPOLICY_HPP #define KOKKOS_TASKPOLICY_HPP
#include <Kokkos_Core_fwd.hpp> //----------------------------------------------------------------------------
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_StaticAssert.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Core_fwd.hpp>
// If compiling with CUDA then CUDA 8 or better and relocatable device
// code are required to enable the task policy.
// nvcc relocatable device code option: --relocatable-device-code=true
#if ( defined( KOKKOS_COMPILER_NVCC ) )
#if ( 8000 <= CUDA_VERSION ) && \
defined( KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE )
#define KOKKOS_ENABLE_TASKPOLICY
#endif
#else
#define KOKKOS_ENABLE_TASKPOLICY
#endif
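Client code that uses the task interface should compile away cleanly where the policy is unavailable (e.g. CUDA without relocatable device code), so call sites are worth guarding with the same macro; sketch:

void maybe_run_tasks() {
#if defined( KOKKOS_ENABLE_TASKPOLICY )
  // task-DAG code here: only compiled on non-CUDA builds, or on
  // CUDA >= 8.0 with --relocatable-device-code=true
#endif
}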
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
#include <Kokkos_MemoryPool.hpp>
#include <impl/Kokkos_Tags.hpp>
#include <impl/Kokkos_TaskQueue.hpp>
//----------------------------------------------------------------------------
namespace Kokkos {
enum TaskType { TaskTeam = Impl::TaskBase<void,void,void>::TaskTeam
, TaskSingle = Impl::TaskBase<void,void,void>::TaskSingle };
enum TaskPriority { TaskHighPriority = 0
, TaskRegularPriority = 1
, TaskLowPriority = 2 };
template< typename Space >
class TaskPolicy ;
template< typename Space >
void wait( TaskPolicy< Space > const & );
} // namespace Kokkos
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
/*\brief Implementation data for task data management, access, and execution.
*
* CRTP Inheritance structure to allow static_cast from the
* task root type and a task's FunctorType.
*
* TaskBase< Space , ResultType , FunctorType >
* : TaskBase< Space , ResultType , void >
* , FunctorType
* { ... };
*
* TaskBase< Space , ResultType , void >
* : TaskBase< Space , void , void >
* { ... };
*/
template< typename Space , typename ResultType , typename FunctorType >
class TaskBase ;
template< typename Space >
class TaskExec ;
}} // namespace Kokkos::Impl
//----------------------------------------------------------------------------
namespace Kokkos {
/**
*
* Future< space > // value_type == void
* Future< value > // space == Default
* Future< value , space >
*
*/
template< typename Arg1 /* = void */ , typename Arg2 /* = void */ >
class Future {
private:
template< typename > friend class TaskPolicy ;
template< typename , typename > friend class Future ;
template< typename , typename , typename > friend class Impl::TaskBase ;
enum { Arg1_is_space = Kokkos::Impl::is_space< Arg1 >::value };
enum { Arg2_is_space = Kokkos::Impl::is_space< Arg2 >::value };
enum { Arg1_is_value = ! Arg1_is_space &&
! std::is_same< Arg1 , void >::value };
enum { Arg2_is_value = ! Arg2_is_space &&
! std::is_same< Arg2 , void >::value };
static_assert( ! ( Arg1_is_space && Arg2_is_space )
, "Future cannot be given two spaces" );
static_assert( ! ( Arg1_is_value && Arg2_is_value )
, "Future cannot be given two value types" );
using ValueType =
typename std::conditional< Arg1_is_value , Arg1 ,
typename std::conditional< Arg2_is_value , Arg2 , void
>::type >::type ;
using Space =
typename std::conditional< Arg1_is_space , Arg1 ,
typename std::conditional< Arg2_is_space , Arg2 , void
>::type >::type ;
using task_base = Impl::TaskBase< Space , ValueType , void > ;
using queue_type = Impl::TaskQueue< Space > ;
task_base * m_task ;
KOKKOS_INLINE_FUNCTION explicit
Future( task_base * task ) : m_task(0)
{ if ( task ) queue_type::assign( & m_task , task ); }
//----------------------------------------
public:
using execution_space = typename Space::execution_space ;
using value_type = ValueType ;
//----------------------------------------
KOKKOS_INLINE_FUNCTION
bool is_null() const { return 0 == m_task ; }
KOKKOS_INLINE_FUNCTION
int reference_count() const
{ return 0 != m_task ? m_task->reference_count() : 0 ; }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
~Future() { if ( m_task ) queue_type::assign( & m_task , (task_base*)0 ); }
//----------------------------------------
KOKKOS_INLINE_FUNCTION
constexpr Future() noexcept : m_task(0) {}
KOKKOS_INLINE_FUNCTION
Future( Future && rhs )
: m_task( rhs.m_task ) { rhs.m_task = 0 ; }
KOKKOS_INLINE_FUNCTION
Future( const Future & rhs )
: m_task(0)
{ if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task ); }
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future && rhs )
{
if ( m_task ) queue_type::assign( & m_task , (task_base*)0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future & rhs )
{
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
//----------------------------------------
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( Future<A1,A2> && rhs )
: m_task( rhs.m_task )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
rhs.m_task = 0 ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future( const Future<A1,A2> & rhs )
: m_task(0)
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( const Future<A1,A2> & rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task || rhs.m_task ) queue_type::assign( & m_task , rhs.m_task );
return *this ;
}
template< class A1 , class A2 >
KOKKOS_INLINE_FUNCTION
Future & operator = ( Future<A1,A2> && rhs )
{
static_assert
( std::is_same< Space , void >::value ||
std::is_same< Space , typename Future<A1,A2>::Space >::value
, "Assigned Futures must have the same space" );
static_assert
( std::is_same< value_type , void >::value ||
std::is_same< value_type , typename Future<A1,A2>::value_type >::value
, "Assigned Futures must have the same value_type" );
if ( m_task ) queue_type::assign( & m_task , (task_base*) 0 );
m_task = rhs.m_task ;
rhs.m_task = 0 ;
return *this ;
}
//----------------------------------------
KOKKOS_INLINE_FUNCTION
typename task_base::get_return_type
get() const
{
if ( 0 == m_task ) {
Kokkos::abort( "Kokkos:::Future::get ERROR: is_null()");
}
return m_task->get();
}
};
} // namespace Kokkos
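Futures are reference-counted handles onto queue-owned tasks: default construction yields a null handle, copies bump the task's count, and get() aborts on null, so the usual pattern checks is_null() first. Sketch (Serial backend assumed):

#include <Kokkos_Core.hpp>

#if defined( KOKKOS_ENABLE_TASKPOLICY )
void inspect(const Kokkos::Future<double, Kokkos::Serial>& f) {
  if (f.is_null()) return;           // get() would abort here
  const int refs = f.reference_count();
  const double v = f.get();          // valid once the task has completed
  (void) refs; (void) v;
}
#endif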
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template< typename ExecSpace >
class TaskPolicy
{
private:
using track_type = Kokkos::Experimental::Impl::SharedAllocationTracker ;
using queue_type = Kokkos::Impl::TaskQueue< ExecSpace > ;
using task_base = Impl::TaskBase< ExecSpace , void , void > ;
track_type m_track ;
queue_type * m_queue ;
//----------------------------------------
// Process optional arguments to spawn and respawn functions
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const ) {}
// TaskTeam or TaskSingle
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskType const & arg
, Options const & ... opts )
{
task->m_task_type = arg ;
assign( task , opts ... );
}
// TaskHighPriority or TaskRegularPriority or TaskLowPriority
template< typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, TaskPriority const & arg
, Options const & ... opts )
{
task->m_priority = arg ;
assign( task , opts ... );
}
// Future for a dependence
template< typename A1 , typename A2 , typename ... Options >
KOKKOS_INLINE_FUNCTION static
void assign( task_base * const task
, Future< A1 , A2 > const & arg
, Options const & ... opts )
{
// Assign dependence to task->m_next
// which will be processed within subsequent call to schedule.
// Error if the dependence is reset.
if ( 0 != Kokkos::atomic_exchange(& task->m_next, arg.m_task) ) {
Kokkos::abort("TaskPolicy ERROR: resetting task dependence");
}
if ( 0 != arg.m_task ) {
// The future may be destroyed upon returning from this call
// so increment reference count to track this assignment.
Kokkos::atomic_fetch_add( &(arg.m_task->m_ref_count) , 1 );
}
assign( task , opts ... );
}
//----------------------------------------
public:
using execution_policy = TaskPolicy ;
using execution_space = ExecSpace ;
using memory_space = typename queue_type::memory_space ;
using member_type = Kokkos::Impl::TaskExec< ExecSpace > ;
KOKKOS_INLINE_FUNCTION
TaskPolicy() : m_track(), m_queue(0) {}
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy( TaskPolicy const & rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
KOKKOS_INLINE_FUNCTION
TaskPolicy & operator = ( TaskPolicy const & rhs ) = default ;
TaskPolicy( memory_space const & arg_memory_space
, unsigned const arg_memory_pool_capacity
, unsigned const arg_memory_pool_log2_superblock = 12 )
: m_track()
, m_queue(0)
{
typedef Kokkos::Experimental::Impl::SharedAllocationRecord
< memory_space , typename queue_type::Destroy >
record_type ;
record_type * record =
record_type::allocate( arg_memory_space
, "TaskQueue"
, sizeof(queue_type)
);
m_queue = new( record->data() )
queue_type( arg_memory_space
, arg_memory_pool_capacity
, arg_memory_pool_log2_superblock );
record->m_destroy.m_queue = m_queue ;
m_track.assign_allocated_record_to_uninitialized( record );
}
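// [Illustrative sketch, not part of this commit.] Constructing a policy with
// the memory-pool constructor above; the capacity value and the variable
// name "policy" are assumptions for illustration. For the OpenMP back-end
// the queue's memory_space is Kokkos::HostSpace.
//
//   Kokkos::TaskPolicy< Kokkos::OpenMP >
//     policy( Kokkos::HostSpace() , 1 << 20 /* memory pool capacity, bytes */ );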
//----------------------------------------
/**\brief Allocation size for a spawned task */
template< typename FunctorType >
KOKKOS_FUNCTION
size_t spawn_allocation_size() const
{
using task_type = Impl::TaskBase< execution_space
, typename FunctorType::value_type
, FunctorType > ;
return m_queue->allocate_block_size( sizeof(task_type) );
}
/**\brief Allocation size for a when_all aggregate */
KOKKOS_FUNCTION
size_t when_all_allocation_size( int narg ) const
{
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
return m_queue->allocate_block_size( sizeof(task_base) + narg * sizeof(task_base*) );
}
//----------------------------------------
/**\brief A task spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
KOKKOS_FUNCTION
Future< typename FunctorType::value_type , ExecSpace >
task_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
//----------------------------------------
// Give single-thread back-ends an opportunity to clear
// their queue of ready tasks before allocating a new task
m_queue->iff_single_thread_recursive_execute();
//----------------------------------------
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast< task_type * >(m_queue->allocate(sizeof(task_type)));
if ( f.m_task ) {
// Placement new construction
new ( f.m_task ) task_type( arg_functor );
// Reference count starts at two
// +1 for matching decrement when task is complete
// +1 for future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Spawning from within the execution space so the
// apply function pointer is guaranteed to be valid
f.m_task->m_apply = task_type::apply ;
m_queue->schedule( f.m_task );
// this task may be updated or executed at any moment
}
return f ;
}
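// [Illustrative sketch, not part of this commit.] From inside an executing
// task, a child may be spawned with any combination of the options handled
// by assign() above; ChildFunctor is a hypothetical functor with a nested
// value_type, and the option names are those noted in the comments above.
//
//   Future< ChildFunctor::value_type , ExecSpace > child =
//     policy.task_spawn( ChildFunctor() , TaskSingle , TaskHighPriority );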
/**\brief The host process spawns a task with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
* 3) Team or Serial
*/
template< typename FunctorType , typename ... Options >
inline
Future< typename FunctorType::value_type , ExecSpace >
host_spawn( FunctorType const & arg_functor
, Options const & ... arg_options
) const
{
using value_type = typename FunctorType::value_type ;
using future_type = Future< value_type , execution_space > ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
future_type f ;
// Allocate task from memory pool
f.m_task =
reinterpret_cast<task_type*>( m_queue->allocate(sizeof(task_type)) );
if ( f.m_task ) {
// Placement new construction
new( f.m_task ) task_type( arg_functor );
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = sizeof(task_type);
assign( f.m_task , arg_options... );
// Potentially spawning outside execution space so the
// apply function pointer must be obtained from execution space.
// Required for Cuda execution space function pointer.
queue_type::specialization::template
proc_set_apply< FunctorType >( & f.m_task->m_apply );
m_queue->schedule( f.m_task );
}
return f ;
}
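// [Illustrative sketch, not part of this commit.] A typical host-side round
// trip, assuming a hypothetical MyFunctor with value_type = double and an
// operator()( member_type & , double & ):
//
//   Future< double , Kokkos::OpenMP > f = policy.host_spawn( MyFunctor() );
//   Kokkos::wait( policy );   // drain the queue (wait is defined below)
//   double result = f.get();  // aborts if f is null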
/**\brief Return a future that is complete
* when all input futures are complete.
*/
template< typename A1 , typename A2 >
KOKKOS_FUNCTION
Future< ExecSpace >
when_all( int narg , Future< A1 , A2 > const * const arg ) const
{
static_assert
( std::is_same< execution_space
, typename Future< A1 , A2 >::execution_space
>::value
, "Future must have same execution space" );
using future_type = Future< ExecSpace > ;
using task_base = Kokkos::Impl::TaskBase< ExecSpace , void , void > ;
future_type f ;
size_t const size = sizeof(task_base) + narg * sizeof(task_base*);
f.m_task =
reinterpret_cast< task_base * >( m_queue->allocate( size ) );
if ( f.m_task ) {
new( f.m_task ) task_base();
// Reference count starts at two:
// +1 to match decrement when task completes
// +1 for the future
f.m_task->m_queue = m_queue ;
f.m_task->m_ref_count = 2 ;
f.m_task->m_alloc_size = size ;
f.m_task->m_dep_count = narg ;
f.m_task->m_task_type = task_base::Aggregate ;
task_base ** const dep = f.m_task->aggregate_dependences();
// Assign dependences to increment their reference count
// The futures may be destroyed upon returning from this call
// so increment reference count to track this assignment.
for ( int i = 0 ; i < narg ; ++i ) {
task_base * const t = dep[i] = arg[i].m_task ;
if ( 0 != t ) {
Kokkos::atomic_fetch_add( &(t->m_ref_count) , 1 );
}
}
m_queue->schedule( f.m_task );
// this when_all may be processed at any moment
}
return f ;
}
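// [Illustrative sketch, not part of this commit.] Aggregating two futures
// and spawning a continuation that runs only after both complete; fa, fb
// and Continuation are hypothetical.
//
//   Future< double , Kokkos::OpenMP > prereq[2] = { fa , fb };
//   Future< Kokkos::OpenMP > all = policy.when_all( 2 , prereq );
//   policy.host_spawn( Continuation() , all );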
/**\brief An executing task respawns itself with options
*
* 1) High, Normal, or Low priority
* 2) With or without dependence
*/
template< class FunctorType , typename ... Options >
KOKKOS_FUNCTION
void respawn( FunctorType * task_self
, Options const & ... arg_options ) const
{
using value_type = typename FunctorType::value_type ;
using task_type = Impl::TaskBase< execution_space
, value_type
, FunctorType > ;
task_base * const zero = (task_base *) 0 ;
task_base * const lock = (task_base *) task_base::LockTag ;
task_type * const task = static_cast< task_type * >( task_self );
// Precondition:
// task is in Executing state
// therefore m_next == LockTag
//
// Change to m_next == 0 for no dependence
if ( lock != Kokkos::atomic_exchange( & task->m_next, zero ) ) {
Kokkos::abort("TaskPolicy::respawn ERROR: already respawned");
}
assign( task , arg_options... );
// Postcondition:
// task is in Executing-Respawn state
//   therefore m_next == dependence or 0
}
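// [Illustrative sketch, not part of this commit.] Inside a functor's
// operator(), re-scheduling the current task to run again once a
// hypothetical future 'dep' completes:
//
//   if ( ! done ) { m_policy.respawn( this , dep ); return ; }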
//----------------------------------------
template< typename S >
friend
void Kokkos::wait( Kokkos::TaskPolicy< S > const & );
//----------------------------------------
inline
int allocation_capacity() const noexcept
{ return m_queue->m_memory.get_mem_size(); }
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const noexcept
{ return m_queue->m_count_alloc ; }
KOKKOS_INLINE_FUNCTION
int allocated_task_count_max() const noexcept
{ return m_queue->m_max_alloc ; }
KOKKOS_INLINE_FUNCTION
long allocated_task_count_accum() const noexcept
{ return m_queue->m_accum_alloc ; }
};
template< typename ExecSpace >
inline
void wait( TaskPolicy< ExecSpace > const & policy )
{ policy.m_queue->execute(); }
} // namespace Kokkos
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
@@ -463,5 +1104,6 @@ void wait( TaskPolicy< ExecSpace > & );
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------
-#endif /* #define KOKKOS_TASKPOLICY_HPP */
+#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
+#endif /* #ifndef KOKKOS_TASKPOLICY_HPP */


@@ -211,6 +211,8 @@ struct VerifyExecutionCanAccessMemorySpace
 #include <Threads/Kokkos_ThreadsTeam.hpp>
 #include <Threads/Kokkos_Threads_Parallel.hpp>
+#include <KokkosExp_MDRangePolicy.hpp>
 //----------------------------------------------------------------------------
 //----------------------------------------------------------------------------

File diff suppressed because it is too large


@@ -178,9 +178,10 @@ public:
 namespace Kokkos {
 namespace Impl {
-template< class FunctorType , class ... Traits >
+template< class FunctorType , class ReducerType, class ... Traits >
 class ParallelReduce< FunctorType
                     , Kokkos::RangePolicy< Traits ...>
+                    , ReducerType
                     , Kokkos::OpenMP
                     >
 {
@@ -192,15 +193,21 @@ private:
   typedef typename Policy::WorkRange    WorkRange ;
   typedef typename Policy::member_type  Member ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType, WorkTag > ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType, WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  // Static Assert WorkTag void if ReducerType not InvalidType
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd, WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd, WorkTag > ValueJoin ;
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
   const FunctorType   m_functor ;
   const Policy        m_policy ;
+  const ReducerType   m_reducer ;
   const pointer_type  m_result_ptr ;
   template< class TagType >
@@ -252,7 +259,7 @@ public:
     OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
     OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
-    OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
+    OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 #pragma omp parallel
     {
@@ -260,7 +267,7 @@ public:
       const WorkRange range( m_policy, exec.pool_rank(), exec.pool_size() );
       ParallelReduce::template exec_range< WorkTag >
         ( m_functor , range.begin() , range.end()
-        , ValueInit::init( m_functor , exec.scratch_reduce() ) );
+        , ValueInit::init( ReducerConditional::select(m_functor , m_reducer), exec.scratch_reduce() ) );
     }
 /* END #pragma omp parallel */
@@ -269,13 +276,13 @@ public:
     const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
     for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-      ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
     }
-    Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
+    Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
     if ( m_result_ptr ) {
-      const int n = ValueTraits::value_count( m_functor );
+      const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
       for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
     }
@@ -289,7 +296,7 @@ public:
     OpenMPexec::verify_is_process("Kokkos::OpenMP parallel_reduce");
     OpenMPexec::verify_initialized("Kokkos::OpenMP parallel_reduce");
-    OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 );
+    OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
 #pragma omp parallel
     {
@@ -302,7 +309,7 @@ public:
       long work_index = exec.get_work_index();
-      reference_type update = ValueInit::init( m_functor , exec.scratch_reduce() );
+      reference_type update = ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() );
       while(work_index != -1) {
         const Member begin = static_cast<Member>(work_index) * m_policy.chunk_size();
         const Member end = begin + m_policy.chunk_size() < m_policy.end()?begin+m_policy.chunk_size():m_policy.end();
@@ -319,13 +326,13 @@ public:
     const pointer_type ptr = pointer_type( OpenMPexec::pool_rev(0)->scratch_reduce() );
     for ( int i = 1 ; i < OpenMPexec::pool_size() ; ++i ) {
-      ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
     }
-    Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
+    Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
     if ( m_result_ptr ) {
-      const int n = ValueTraits::value_count( m_functor );
+      const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
       for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
     }
@@ -337,18 +344,35 @@ public:
   inline
   ParallelReduce( const FunctorType & arg_functor
                 , Policy       arg_policy
-                , const ViewType & arg_result_view )
+                , const ViewType & arg_result_view
+                , typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_result_ptr(  arg_result_view.ptr_on_device() )
+    , m_reducer( InvalidType() )
+    , m_result_ptr(  arg_result_view.data() )
     {
-      static_assert( Kokkos::is_view< ViewType >::value
-        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View" );
-      static_assert( std::is_same< typename ViewType::memory_space
+      /*static_assert( std::is_same< typename ViewType::memory_space
                       , Kokkos::HostSpace >::value
-        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
     }
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
 };
 } // namespace Impl
@@ -568,13 +592,13 @@ public:
     const size_t team_reduce_size = Policy::member_type::team_reduce_size();
-    OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size );
+    OpenMPexec::resize_scratch( 0 , team_reduce_size + m_shmem_size + m_policy.scratch_size(1));
 #pragma omp parallel
     {
       ParallelFor::template exec_team< WorkTag, typename Policy::schedule_type::type>
         ( m_functor
-        , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size) );
+        , Member( * OpenMPexec::get_thread_omp(), m_policy, m_shmem_size, 0) );
     }
 /* END #pragma omp parallel */
   }
@@ -584,14 +608,15 @@ public:
                 , const Policy      & arg_policy )
     : m_functor( arg_functor )
     , m_policy(  arg_policy )
-    , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
     {}
 };
-template< class FunctorType , class ... Properties >
+template< class FunctorType , class ReducerType, class ... Properties >
 class ParallelReduce< FunctorType
                     , Kokkos::TeamPolicy< Properties ... >
+                    , ReducerType
                     , Kokkos::OpenMP
                     >
 {
@@ -602,15 +627,19 @@ private:
   typedef typename Policy::work_tag     WorkTag ;
   typedef typename Policy::member_type  Member ;
-  typedef Kokkos::Impl::FunctorValueTraits< FunctorType , WorkTag > ValueTraits ;
-  typedef Kokkos::Impl::FunctorValueInit<   FunctorType , WorkTag > ValueInit ;
-  typedef Kokkos::Impl::FunctorValueJoin<   FunctorType , WorkTag > ValueJoin ;
+  typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
+  typedef typename ReducerConditional::type ReducerTypeFwd;
+  typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
+  typedef Kokkos::Impl::FunctorValueInit<   ReducerTypeFwd , WorkTag > ValueInit ;
+  typedef Kokkos::Impl::FunctorValueJoin<   ReducerTypeFwd , WorkTag > ValueJoin ;
   typedef typename ValueTraits::pointer_type    pointer_type ;
   typedef typename ValueTraits::reference_type  reference_type ;
   const FunctorType  m_functor ;
   const Policy       m_policy ;
+  const ReducerType  m_reducer ;
   const pointer_type m_result_ptr ;
   const int          m_shmem_size ;
@@ -644,7 +673,7 @@ public:
     const size_t team_reduce_size = Policy::member_type::team_reduce_size();
-    OpenMPexec::resize_scratch( ValueTraits::value_size( m_functor ) , team_reduce_size + m_shmem_size );
+    OpenMPexec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , team_reduce_size + m_shmem_size );
 #pragma omp parallel
     {
@@ -652,8 +681,8 @@ public:
       ParallelReduce::template exec_team< WorkTag >
         ( m_functor
-        , Member( exec , m_policy , m_shmem_size )
-        , ValueInit::init( m_functor , exec.scratch_reduce() ) );
+        , Member( exec , m_policy , m_shmem_size, 0 )
+        , ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , exec.scratch_reduce() ) );
     }
 /* END #pragma omp parallel */
@@ -665,13 +694,13 @@ public:
     max_active_threads = m_policy.league_size()* m_policy.team_size();
     for ( int i = 1 ; i < max_active_threads ; ++i ) {
-      ValueJoin::join( m_functor , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
+      ValueJoin::join( ReducerConditional::select(m_functor , m_reducer) , ptr , OpenMPexec::pool_rev(i)->scratch_reduce() );
     }
-    Kokkos::Impl::FunctorFinal< FunctorType , WorkTag >::final( m_functor , ptr );
+    Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr );
     if ( m_result_ptr ) {
-      const int n = ValueTraits::value_count( m_functor );
+      const int n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
       for ( int j = 0 ; j < n ; ++j ) { m_result_ptr[j] = ptr[j] ; }
     }
@@ -682,12 +711,33 @@ public:
   inline
   ParallelReduce( const FunctorType & arg_functor ,
                   const Policy      & arg_policy ,
-                  const ViewType    & arg_result )
+                  const ViewType    & arg_result ,
+                  typename std::enable_if<
+                    Kokkos::is_view< ViewType >::value &&
+                    !Kokkos::is_reducer_type<ReducerType>::value
+                  ,void*>::type = NULL)
     : m_functor( arg_functor )
     , m_policy( arg_policy )
+    , m_reducer( InvalidType() )
    , m_result_ptr( arg_result.ptr_on_device() )
-    , m_shmem_size( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
     {}
+  inline
+  ParallelReduce( const FunctorType & arg_functor
+                , Policy       arg_policy
+                , const ReducerType& reducer )
+    : m_functor( arg_functor )
+    , m_policy(  arg_policy )
+    , m_reducer( reducer )
+    , m_result_ptr(  reducer.result_view().data() )
+    , m_shmem_size( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
+    {
+      /*static_assert( std::is_same< typename ViewType::memory_space
+                      , Kokkos::HostSpace >::value
+        , "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
+    }
 };
 } // namespace Impl
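// [Illustrative sketch, not part of this commit.] The ReducerConditional
// typedef introduced above selects, at compile time, whether the functor or
// the reducer supplies the reduction's init/join/final operations. A minimal
// analogue of the assumed Kokkos::Impl::if_c facility:
//
//   template< bool Cond , class T , class F >
//   struct if_c {
//     typedef F type ;
//     static const F & select( const T & , const F & f ) { return f ; }
//   };
//   template< class T , class F >
//   struct if_c< true , T , F > {
//     typedef T type ;
//     static const T & select( const T & t , const F & ) { return t ; }
//   };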


@@ -0,0 +1,329 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core.hpp>
#if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#include <impl/Kokkos_TaskQueue_impl.hpp>
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
TaskExec< Kokkos::OpenMP >::
TaskExec()
: m_self_exec( 0 )
, m_team_exec( 0 )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( 0 )
, m_team_rank( 0 )
, m_team_size( 1 )
{
}
TaskExec< Kokkos::OpenMP >::
TaskExec( Kokkos::Impl::OpenMPexec & arg_exec , int const arg_team_size )
: m_self_exec( & arg_exec )
, m_team_exec( arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size) )
, m_sync_mask( 0 )
, m_sync_value( 0 )
, m_sync_step( 0 )
, m_group_rank( arg_exec.pool_rank_rev() / arg_team_size )
, m_team_rank( arg_exec.pool_rank_rev() % arg_team_size )
, m_team_size( arg_team_size )
{
// This team spans
// m_self_exec->pool_rev( team_size * group_rank )
// m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 )
int64_t volatile * const sync = (int64_t *) m_self_exec->scratch_reduce();
sync[0] = int64_t(0) ;
sync[1] = int64_t(0) ;
for ( int i = 0 ; i < m_team_size ; ++i ) {
m_sync_value |= int64_t(1) << (8*i);
m_sync_mask |= int64_t(3) << (8*i);
}
Kokkos::memory_fence();
}
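// [Worked illustration, not part of this commit.] Each team member owns one
// byte in the 64-bit sync words, which is why team_size <= 8 is required.
// For team_size = 4 the loop above yields
//
//   m_sync_value = 0x01010101   // expected first-arrival pattern, 0x01 per byte
//   m_sync_mask  = 0x03030303   // two bits reserved per member
//
// and the XOR with m_sync_mask in team_barrier_impl() alternates each
// member's expected arrival byte between 0x01 and 0x02 on successive steps.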
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void TaskExec< Kokkos::OpenMP >::team_barrier_impl() const
{
if ( m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t)) ) {
Kokkos::abort("TaskQueue<OpenMP> scratch_reduce memory too small");
}
// Use team shared memory to synchronize.
// Alternate memory locations between barriers to avoid a sequence
// of barriers overtaking one another.
int64_t volatile * const sync =
((int64_t *) m_team_exec->scratch_reduce()) + ( m_sync_step & 0x01 );
// This team member sets one byte within the sync variable
int8_t volatile * const sync_self =
((int8_t *) sync) + m_team_rank ;
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : before(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
*sync_self = int8_t( m_sync_value & 0x03 ); // signal arrival
while ( m_sync_value != *sync ); // wait for team to arrive
#if 0
fprintf( stdout
, "barrier group(%d) member(%d) step(%d) wait(%lx) : after(%lx)\n"
, m_group_rank
, m_team_rank
, m_sync_step
, m_sync_value
, *sync
);
fflush(stdout);
#endif
++m_sync_step ;
if ( 0 == ( 0x01 & m_sync_step ) ) { // Every other step
m_sync_value ^= m_sync_mask ;
if ( 1000 < m_sync_step ) m_sync_step = 0 ;
}
}
#endif
//----------------------------------------------------------------------------
void TaskQueueSpecialization< Kokkos::OpenMP >::execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
using Member = TaskExec< execution_space > ;
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
// Required: team_size <= 8
const int team_size = PoolExec::pool_size(2); // Threads per core
// const int team_size = PoolExec::pool_size(1); // Threads per NUMA
if ( 8 < team_size ) {
Kokkos::abort("TaskQueue<OpenMP> unsupported team size");
}
#pragma omp parallel
{
PoolExec & self = *PoolExec::get_thread_omp();
Member single_exec ;
Member team_exec( self , team_size );
// Team shared memory
task_root_type * volatile * const task_shared =
(task_root_type **) team_exec.m_team_exec->scratch_thread();
// Barrier across entire OpenMP thread pool to ensure initialization
#pragma omp barrier
// Loop until all queues are empty and no tasks in flight
do {
task_root_type * task = 0 ;
// Each team lead attempts to acquire either a thread team task
// or a single thread task for the team.
if ( 0 == team_exec.team_rank() ) {
task = 0 < *((volatile int *) & queue->m_ready_count) ? end : 0 ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
}
// Team lead broadcast acquired task to team members:
if ( 1 < team_exec.team_size() ) {
if ( 0 == team_exec.team_rank() ) *task_shared = task ;
// Fence to be sure task_shared is stored before the barrier
Kokkos::memory_fence();
// Whole team waits for every team member to reach this statement
team_exec.team_barrier();
// Fence to be sure task_shared is stored
Kokkos::memory_fence();
task = *task_shared ;
}
#if 0
fprintf( stdout
, "\nexecute group(%d) member(%d) task_shared(0x%lx) task(0x%lx)\n"
, team_exec.m_group_rank
, team_exec.m_team_rank
, uintptr_t(task_shared)
, uintptr_t(task)
);
fflush(stdout);
#endif
if ( 0 == task ) break ; // 0 == m_ready_count
if ( end == task ) {
// All team members wait for the whole team to reach this statement.
// This is necessary to prevent task_shared from being updated
// before it has been read by every thread.
team_exec.team_barrier();
}
else if ( task_root_type::TaskTeam == task->m_task_type ) {
// Thread Team Task
(*task->m_apply)( task , & team_exec );
// The m_apply function performs a barrier
if ( 0 == team_exec.team_rank() ) {
// team member #0 completes the task, which may delete the task
queue->complete( task );
}
}
else {
// Single Thread Task
if ( 0 == team_exec.team_rank() ) {
(*task->m_apply)( task , & single_exec );
queue->complete( task );
}
// All team members wait for the whole team to reach this statement.
// This is not needed to complete the task, but it is necessary to
// prevent task_shared from being updated before it has been read
// by every thread.
team_exec.team_barrier();
}
} while(1);
}
// END #pragma omp parallel
}
void TaskQueueSpecialization< Kokkos::OpenMP >::
iff_single_thread_recursive_execute
( TaskQueue< Kokkos::OpenMP > * const queue )
{
using execution_space = Kokkos::OpenMP ;
using queue_type = TaskQueue< execution_space > ;
using task_root_type = TaskBase< execution_space , void , void > ;
using Member = TaskExec< execution_space > ;
if ( 1 == omp_get_num_threads() ) {
task_root_type * const end = (task_root_type *) task_root_type::EndTag ;
Member single_exec ;
task_root_type * task = end ;
do {
task = end ;
// Loop by priority and then type
for ( int i = 0 ; i < queue_type::NumQueue && end == task ; ++i ) {
for ( int j = 0 ; j < 2 && end == task ; ++j ) {
task = queue_type::pop_task( & queue->m_ready[i][j] );
}
}
if ( end == task ) break ;
(*task->m_apply)( task , & single_exec );
queue->complete( task );
} while(1);
}
}
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_HAVE_OPENMP ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */


@@ -0,0 +1,356 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP
#define KOKKOS_IMPL_OPENMP_TASK_HPP
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
namespace Impl {
template<>
class TaskQueueSpecialization< Kokkos::OpenMP >
{
public:
using execution_space = Kokkos::OpenMP ;
using queue_type = Kokkos::Impl::TaskQueue< execution_space > ;
using task_base_type = Kokkos::Impl::TaskBase< execution_space , void , void > ;
// Must specify memory space
using memory_space = Kokkos::HostSpace ;
static
void iff_single_thread_recursive_execute( queue_type * const );
// Must provide task queue execution function
static void execute( queue_type * const );
// Must provide mechanism to set function pointer in
// execution space from the host process.
template< typename FunctorType >
static
void proc_set_apply( task_base_type::function_type * ptr )
{
using TaskType = TaskBase< Kokkos::OpenMP
, typename FunctorType::value_type
, FunctorType
> ;
*ptr = TaskType::apply ;
}
};
extern template class TaskQueue< Kokkos::OpenMP > ;
//----------------------------------------------------------------------------
template<>
class TaskExec< Kokkos::OpenMP >
{
private:
TaskExec( TaskExec && ) = delete ;
TaskExec( TaskExec const & ) = delete ;
TaskExec & operator = ( TaskExec && ) = delete ;
TaskExec & operator = ( TaskExec const & ) = delete ;
using PoolExec = Kokkos::Impl::OpenMPexec ;
friend class Kokkos::Impl::TaskQueue< Kokkos::OpenMP > ;
friend class Kokkos::Impl::TaskQueueSpecialization< Kokkos::OpenMP > ;
PoolExec * const m_self_exec ; ///< This thread's thread pool data structure
PoolExec * const m_team_exec ; ///< Team thread's thread pool data structure
int64_t m_sync_mask ;
int64_t mutable m_sync_value ;
int mutable m_sync_step ;
int m_group_rank ; ///< Which "team" subset of thread pool
int m_team_rank ; ///< Which thread within a team
int m_team_size ;
TaskExec();
TaskExec( PoolExec & arg_exec , int arg_team_size );
void team_barrier_impl() const ;
public:
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
void * team_shared() const
{ return m_team_exec ? m_team_exec->scratch_thread() : (void*) 0 ; }
int team_shared_size() const
{ return m_team_exec ? m_team_exec->scratch_thread_size() : 0 ; }
/**\brief Whole team enters this function call
*         before any team member returns from
* this function call.
*/
void team_barrier() const { if ( 1 < m_team_size ) team_barrier_impl(); }
#else
KOKKOS_INLINE_FUNCTION void team_barrier() const {}
KOKKOS_INLINE_FUNCTION void * team_shared() const { return 0 ; }
KOKKOS_INLINE_FUNCTION int team_shared_size() const { return 0 ; }
#endif
KOKKOS_INLINE_FUNCTION
int team_rank() const { return m_team_rank ; }
KOKKOS_INLINE_FUNCTION
int team_size() const { return m_team_size ; }
};
}} /* namespace Kokkos::Impl */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
namespace Kokkos {
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & count )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,count);
}
template<typename iType>
KOKKOS_INLINE_FUNCTION
Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >
TeamThreadRange
( Impl::TaskExec< Kokkos::OpenMP > & thread
, const iType & start
, const iType & end )
{
return Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >(thread,start,end);
}
/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each i=0..N-1.
*
* The range i=0..N-1 is mapped to all threads of the calling thread team.
* This functionality requires C++11 support.
*/
template<typename iType, class Lambda>
KOKKOS_INLINE_FUNCTION
void parallel_for
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
)
{
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i);
}
}
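// [Illustrative usage sketch, not part of this commit.] Inside a team task's
// operator(), the construct above distributes i = 0..n-1 across the team;
// member, n, a, x and y are hypothetical.
//
//   Kokkos::parallel_for( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i ) { y[i] += a * x[i] ; } );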
template<typename iType, class Lambda, typename ValueType>
KOKKOS_INLINE_FUNCTION
void parallel_reduce
( const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries
, const Lambda& lambda
, ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
shared[0] += shared[i];
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
ValueType result = initialized_result;
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
lambda(i, result);
}
if ( 1 < loop_boundaries.thread.team_size() ) {
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
loop_boundaries.thread.team_barrier();
shared[team_rank] = result;
loop_boundaries.thread.team_barrier();
// reduce across threads to thread 0
if (team_rank == 0) {
for (int i = 1; i < loop_boundaries.thread.team_size(); i++) {
join(shared[0], shared[i]);
}
}
loop_boundaries.thread.team_barrier();
// broadcast result
initialized_result = shared[0];
}
else {
initialized_result = result ;
}
}
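// [Illustrative usage sketch, not part of this commit.] The JoinType
// overload supports non-sum reductions, e.g. a team-wide maximum;
// member, n and v are hypothetical.
//
//   double team_max = 0.0 ;
//   Kokkos::parallel_reduce( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i , double & m ) { if ( m < v[i] ) m = v[i] ; } ,
//     []( double & dst , const double & src ) { if ( dst < src ) dst = src ; } ,
//     team_max );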
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
ValueType& initialized_result)
{
}
// placeholder for future function
template< typename iType, class Lambda, typename ValueType, class JoinType >
KOKKOS_INLINE_FUNCTION
void parallel_reduce
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda,
const JoinType & join,
ValueType& initialized_result)
{
}
template< typename ValueType, typename iType, class Lambda >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::TeamThreadRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
ValueType accum = 0 ;
ValueType val, local_total;
ValueType *shared = (ValueType*) loop_boundaries.thread.team_shared();
int team_size = loop_boundaries.thread.team_size();
int team_rank = loop_boundaries.thread.team_rank(); // member num within the team
// Intra-member scan
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
shared[team_rank] = accum;
loop_boundaries.thread.team_barrier();
// Member 0 performs the scan over the accumulated totals
if (team_rank == 0) {
for( iType i = 1; i < team_size; i+=1) {
shared[i] += shared[i-1];
}
accum = 0; // Member 0 sets accum to 0 in preparation for the inter-member scan
}
loop_boundaries.thread.team_barrier();
// Inter-member scan adding in accumulated totals
if (team_rank != 0) { accum = shared[team_rank-1]; }
for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) {
local_total = 0;
lambda(i,local_total,false);
val = accum;
lambda(i,val,true);
accum += local_total;
}
}
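// [Illustrative usage sketch, not part of this commit.] The scan lambda is
// invoked as lambda(i,val,final): it must add member i's contribution to
// val, and write results only when final is true. Note ValueType is the
// leading template parameter and is not deduced, so it is given explicitly.
// An exclusive prefix sum; member, n, in and out are hypothetical:
//
//   Kokkos::parallel_scan< int >( Kokkos::TeamThreadRange( member , n ) ,
//     [&]( const int i , int & val , const bool final ) {
//       if ( final ) { out[i] = val ; }
//       val += in[i] ;
//     } );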
// placeholder for future function
template< typename iType, class Lambda, typename ValueType >
KOKKOS_INLINE_FUNCTION
void parallel_scan
(const Impl::ThreadVectorRangeBoundariesStruct<iType,Impl::TaskExec< Kokkos::OpenMP > >& loop_boundaries,
const Lambda & lambda)
{
}
} /* namespace Kokkos */
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */


@@ -49,6 +49,7 @@
 #include <impl/Kokkos_Error.hpp>
 #include <iostream>
 #include <impl/Kokkos_CPUDiscovery.hpp>
+#include <impl/Kokkos_Profiling_Interface.hpp>
 #ifdef KOKKOS_HAVE_OPENMP
@@ -85,16 +86,8 @@ int OpenMPexec::m_map_rank[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
 int OpenMPexec::m_pool_topo[ 4 ] = { 0 };
-#if ! KOKKOS_USING_EXP_VIEW
-OpenMPexec::Pool OpenMPexec::m_pool;
-#else
 OpenMPexec * OpenMPexec::m_pool[ OpenMPexec::MAX_THREAD_COUNT ] = { 0 };
-#endif
 void OpenMPexec::verify_is_process( const char * const label )
 {
   if ( omp_in_parallel() ) {
@@ -125,16 +118,12 @@ void OpenMPexec::clear_scratch()
 #pragma omp parallel
 {
   const int rank_rev = m_map_rank[ omp_get_thread_num() ];
-#if KOKKOS_USING_EXP_VIEW
   typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
   if ( m_pool[ rank_rev ] ) {
     Record * const r = Record::get_record( m_pool[ rank_rev ] );
     m_pool[ rank_rev ] = 0 ;
     Record::decrement( r );
   }
-#else
-  m_pool.at(rank_rev).clear();
-#endif
 }
 /* END #pragma omp parallel */
 }
@@ -172,8 +161,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
   const int rank_rev = m_map_rank[ omp_get_thread_num() ];
   const int rank     = pool_size - ( rank_rev + 1 );
-#if KOKKOS_USING_EXP_VIEW
   typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
   Record * const r = Record::allocate( Kokkos::HostSpace()
@@ -184,15 +171,6 @@ void OpenMPexec::resize_scratch( size_t reduce_size , size_t thread_size )
   m_pool[ rank_rev ] = reinterpret_cast<OpenMPexec*>( r->data() );
-#else
-#pragma omp critical
-  {
-    m_pool.at(rank_rev) = HostSpace::allocate_and_track( "openmp_scratch", alloc_size );
-  }
-#endif
   new ( m_pool[ rank_rev ] ) OpenMPexec( rank , ALLOC_EXEC , reduce_size , thread_size );
 }
 /* END #pragma omp parallel */
@@ -330,6 +308,10 @@ void OpenMP::initialize( unsigned thread_count ,
   }
   // Init the array for used for arbitrarily sized atomics
   Impl::init_lock_array_host_space();
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::initialize();
+  #endif
 }
 //----------------------------------------------------------------------------
@@ -350,6 +332,10 @@ void OpenMP::finalize()
   if ( Impl::s_using_hwloc && Kokkos::hwloc::can_bind_threads() ) {
     hwloc::unbind_this_thread();
   }
+  #if (KOKKOS_ENABLE_PROFILING)
+    Kokkos::Profiling::finalize();
+  #endif
 }
 //----------------------------------------------------------------------------


@@ -46,7 +46,6 @@
 #include <impl/Kokkos_Traits.hpp>
 #include <impl/Kokkos_spinwait.hpp>
-#include <impl/Kokkos_AllocationTracker.hpp>
 #include <Kokkos_Atomic.hpp>
 #include <iostream>
@@ -63,38 +62,10 @@ public:
   enum { MAX_THREAD_COUNT = 4096 };
-#if ! KOKKOS_USING_EXP_VIEW
-  struct Pool
-  {
-    Pool() : m_trackers() {}
-    AllocationTracker m_trackers[ MAX_THREAD_COUNT ];
-    OpenMPexec * operator[](int i)
-    {
-      return reinterpret_cast<OpenMPexec *>(m_trackers[i].alloc_ptr());
-    }
-    AllocationTracker & at(int i)
-    {
-      return m_trackers[i];
-    }
-  };
-private:
-  static Pool m_pool; // Indexed by: m_pool_rank_rev
-#else
 private:
   static OpenMPexec * m_pool[ MAX_THREAD_COUNT ]; // Indexed by: m_pool_rank_rev
-#endif
   static int          m_pool_topo[ 4 ];
   static int          m_map_rank[ MAX_THREAD_COUNT ];
@@ -145,6 +116,12 @@ public:
   inline long team_work_index() const { return m_team_work_index ; }
+  inline int scratch_reduce_size() const
+    { return m_scratch_reduce_end - m_scratch_exec_end ; }
+  inline int scratch_thread_size() const
+    { return m_scratch_thread_end - m_scratch_reduce_end ; }
   inline void * scratch_reduce() const { return ((char *) this) + m_scratch_exec_end ; }
   inline void * scratch_thread() const { return ((char *) this) + m_scratch_reduce_end ; }
@@ -157,15 +134,15 @@ public:
   ~OpenMPexec() {}
-  OpenMPexec( const int poolRank
-            , const int scratch_exec_size
-            , const int scratch_reduce_size
-            , const int scratch_thread_size )
-    : m_pool_rank( poolRank )
-    , m_pool_rank_rev( pool_size() - ( poolRank + 1 ) )
-    , m_scratch_exec_end( scratch_exec_size )
-    , m_scratch_reduce_end( m_scratch_exec_end + scratch_reduce_size )
-    , m_scratch_thread_end( m_scratch_reduce_end + scratch_thread_size )
+  OpenMPexec( const int arg_poolRank
+            , const int arg_scratch_exec_size
+            , const int arg_scratch_reduce_size
+            , const int arg_scratch_thread_size )
+    : m_pool_rank( arg_poolRank )
+    , m_pool_rank_rev( pool_size() - ( arg_poolRank + 1 ) )
+    , m_scratch_exec_end( arg_scratch_exec_size )
+    , m_scratch_reduce_end( m_scratch_exec_end + arg_scratch_reduce_size )
+    , m_scratch_thread_end( m_scratch_reduce_end + arg_scratch_thread_size )
     , m_barrier_state(0)
     {}
@@ -330,7 +307,7 @@ public:
   Impl::OpenMPexec    & m_exec ;
   scratch_memory_space  m_team_shared ;
-  int                   m_team_shmem ;
+  int                   m_team_scratch_size[2] ;
   int                   m_team_base_rev ;
   int                   m_team_rank_rev ;
   int                   m_team_rank ;
@@ -378,15 +355,15 @@ public:
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_shmem() const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& team_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(1,0) ; }
+    { return m_team_shared.set_team_thread_mode(0,1,0) ; }
   KOKKOS_INLINE_FUNCTION
   const execution_space::scratch_memory_space& thread_scratch(int) const
-    { return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; }
+    { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
   KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
   KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@@ -568,11 +545,12 @@ public:
   inline
   OpenMPexecTeamMember( Impl::OpenMPexec & exec
                       , const TeamPolicyInternal< OpenMP, Properties ...> & team
-                      , const int shmem_size
+                      , const int shmem_size_L1
+                      , const int shmem_size_L2
                       )
     : m_exec( exec )
     , m_team_shared(0,0)
-    , m_team_shmem( shmem_size )
+    , m_team_scratch_size{ shmem_size_L1 , shmem_size_L2 }
    , m_team_base_rev(0)
    , m_team_rank_rev(0)
    , m_team_rank(0)
@@ -580,7 +558,7 @@ public:
    , m_league_rank(0)
    , m_league_end(0)
    , m_league_size( team.league_size() )
-    , m_chunk_size( team.chunk_size() )
+    , m_chunk_size( team.chunk_size()>0?team.chunk_size():team.team_iter() )
    , m_league_chunk_end(0)
    , m_team_lead_exec( *exec.pool_rev( team.team_alloc() * (m_exec.pool_rank_rev()/team.team_alloc()) ))
    , m_team_alloc( team.team_alloc())
@@ -589,10 +567,9 @@ public:
     const int pool_team_rank_rev   = pool_rank_rev % team.team_alloc();
     const int pool_league_rank_rev = pool_rank_rev / team.team_alloc();
     const int pool_num_teams       = OpenMP::thread_pool_size(0)/team.team_alloc();
-    const int chunk_size           = team.chunk_size()>0?team.chunk_size():team.team_iter();
-    const int chunks_per_team      = ( team.league_size() + chunk_size*pool_num_teams-1 ) / (chunk_size*pool_num_teams);
-    int league_iter_end            = team.league_size() - pool_league_rank_rev * chunks_per_team * chunk_size;
-    int league_iter_begin          = league_iter_end - chunks_per_team * chunk_size;
+    const int chunks_per_team      = ( team.league_size() + m_chunk_size*pool_num_teams-1 ) / (m_chunk_size*pool_num_teams);
+    int league_iter_end            = team.league_size() - pool_league_rank_rev * chunks_per_team * m_chunk_size;
+    int league_iter_begin          = league_iter_end - chunks_per_team * m_chunk_size;
     if (league_iter_begin < 0)                league_iter_begin = 0;
     if (league_iter_end>team.league_size())   league_iter_end   = team.league_size();
@@ -611,7 +588,9 @@ public:
       m_team_rank     = m_team_size - ( m_team_rank_rev + 1 );
       m_league_end    = league_iter_end ;
       m_league_rank   = league_iter_begin ;
-      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                           0 );
     }
     if ( (m_team_rank_rev == 0) && (m_invalid_thread == 0) ) {
@@ -627,10 +606,13 @@ public:
   void next_static()
     {
-      if ( ++m_league_rank < m_league_end ) {
+      if ( m_league_rank < m_league_end ) {
        team_barrier();
-        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+        new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                             ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                             0);
      }
+      m_league_rank++;
    }
   bool valid_dynamic() {
@@ -661,10 +643,13 @@ public:
     if(m_invalid_thread)
       return;
+    if ( m_league_rank < m_league_chunk_end ) {
       team_barrier();
-    if ( ++m_league_rank < m_league_chunk_end ) {
-      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_shmem );
+      new( (void*) &m_team_shared ) space( ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE , m_team_scratch_size[0] ,
+                                           ( (char*) m_exec.pool_rev(m_team_base_rev)->scratch_thread() ) + TEAM_REDUCE_SIZE + m_team_scratch_size[0],
+                                           0);
     }
+    m_league_rank++;
   }
   static inline int team_reduce_size() { return TEAM_REDUCE_SIZE ; }
@@ -687,8 +672,10 @@ public:
     m_team_size = p.m_team_size;
     m_team_alloc = p.m_team_alloc;
     m_team_iter = p.m_team_iter;
-    m_team_scratch_size = p.m_team_scratch_size;
-    m_thread_scratch_size = p.m_thread_scratch_size;
+    m_team_scratch_size[0] = p.m_team_scratch_size[0];
+    m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
+    m_team_scratch_size[1] = p.m_team_scratch_size[1];
+    m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
     m_chunk_size = p.m_chunk_size;
     return *this;
   }
@@ -719,8 +706,8 @@ private:
   int m_team_alloc ;
   int m_team_iter ;
-  size_t m_team_scratch_size;
-  size_t m_thread_scratch_size;
+  size_t m_team_scratch_size[2];
+  size_t m_thread_scratch_size[2];
   int m_chunk_size;
@@ -753,15 +740,19 @@ public:
   inline int team_size() const { return m_team_size ; }
   inline int league_size() const { return m_league_size ; }
-  inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; }
+  inline size_t scratch_size(const int& level, int team_size_ = -1) const {
+    if(team_size_ < 0)
+      team_size_ = m_team_size;
+    return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
+  }
   /** \brief  Specify league size, request team size */
   TeamPolicyInternal( typename traits::execution_space &
                     , int league_size_request
                     , int team_size_request
                     , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init( league_size_request , team_size_request ); }
@@ -769,24 +760,24 @@ public:
                     , int league_size_request
                     , const Kokkos::AUTO_t & /* team_size_request */
                     , int /* vector_length_request */ = 1)
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
     , m_chunk_size(0)
     { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
   TeamPolicyInternal( int league_size_request
                     , int team_size_request
                     , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
    , m_chunk_size(0)
    { init( league_size_request , team_size_request ); }
   TeamPolicyInternal( int league_size_request
                     , const Kokkos::AUTO_t & /* team_size_request */
                     , int /* vector_length_request */ = 1 )
-    : m_team_scratch_size ( 0 )
-    , m_thread_scratch_size ( 0 )
+    : m_team_scratch_size { 0 , 0 }
+    , m_thread_scratch_size { 0 , 0 }
    , m_chunk_size(0)
    { init( league_size_request , traits::execution_space::thread_pool_size(2) ); }
@@ -803,24 +794,21 @@ public:
   }
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_team_scratch_size = per_team.value;
+    p.m_team_scratch_size[level] = per_team.value;
     return p;
   };
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
-    p.m_thread_scratch_size = per_thread.value;
+    p.m_thread_scratch_size[level] = per_thread.value;
     return p;
   };
   inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
-    (void) level;
     TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value; p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size = per_thread.value; p.m_thread_scratch_size[level] = per_thread.value;
return p; return p;
}; };
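The per-level set_scratch_size overloads above replace the old single scalar request. A minimal usage sketch, assuming the public Kokkos::PerTeam / Kokkos::PerThread helpers of this era forward to the PerTeamValue / PerThreadValue arguments shown in the diff; league_size, team_size and functor are placeholders:

// Request 4 KB of team scratch at level 0 and 512 B per thread at level 1.
// Each call returns a modified copy of the policy, so the calls chain.
auto policy = Kokkos::TeamPolicy< Kokkos::Qthread >( league_size , team_size )
                .set_scratch_size( 0 , Kokkos::PerTeam( 4096 ) )
                .set_scratch_size( 1 , Kokkos::PerThread( 512 ) );
Kokkos::parallel_for( policy , functor );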

View File

@ -104,7 +104,7 @@ namespace Kokkos {
int Qthread::is_initialized() int Qthread::is_initialized()
{ {
Impl::s_number_workers != 0 ; return Impl::s_number_workers != 0 ;
} }
int Qthread::concurrency() int Qthread::concurrency()

View File

@ -145,11 +145,13 @@ public:
//---------------------------------------- //----------------------------------------
/** Reduce across all workers participating in the 'exec_all' */ /** Reduce across all workers participating in the 'exec_all' */
template< class FunctorType , class ArgTag > template< class FunctorType , class ReducerType , class ArgTag >
inline inline
void exec_all_reduce( const FunctorType & func ) const void exec_all_reduce( const FunctorType & func, const ReducerType & reduce ) const
{ {
typedef Kokkos::Impl::FunctorValueJoin< FunctorType , ArgTag > ValueJoin ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, ArgTag > ValueJoin ;
const int rev_rank = m_worker_size - ( m_worker_rank + 1 ); const int rev_rank = m_worker_size - ( m_worker_rank + 1 );
@ -160,7 +162,7 @@ public:
Impl::spinwait( fan.m_worker_state , QthreadExec::Active ); Impl::spinwait( fan.m_worker_state , QthreadExec::Active );
ValueJoin::join( func , m_scratch_alloc , fan.m_scratch_alloc ); ValueJoin::join( ReducerConditional::select(func , reduce) , m_scratch_alloc , fan.m_scratch_alloc );
} }
if ( rev_rank ) { if ( rev_rank ) {
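exec_all_reduce now joins with whichever object actually defines the reduction. A standalone sketch of the if_c dispatch it leans on; this analogue is written here for illustration and only approximates the real Kokkos::Impl::if_c:

template< bool Cond , class T , class F > struct if_c ;
template< class T , class F > struct if_c< true , T , F > {
  typedef T type ;
  static const T & select( const T & t , const F & ) { return t ; }
};
template< class T , class F > struct if_c< false , T , F > {
  typedef F type ;
  static const F & select( const T & , const F & f ) { return f ; }
};
// With ReducerType == InvalidType the condition is true, so select()
// returns the functor; otherwise it returns the reducer. Either way
// ValueJoin::join receives the object that defines join().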

View File

@ -130,9 +130,10 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
template< class FunctorType , class ... Traits > template< class FunctorType , class ReducerType , class ... Traits >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... > , Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Qthread , Kokkos::Qthread
> >
{ {
@ -141,17 +142,23 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType, ReducerType>::value, FunctorType, ReducerType > ReducerConditional;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef typename ReducerConditional::type ReducerTypeFwd;
// Static assert that WorkTag is void if ReducerType is not InvalidType
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
template< class TagType > template< class TagType >
@ -187,9 +194,10 @@ private:
ParallelReduce::template exec_range< WorkTag >( ParallelReduce::template exec_range< WorkTag >(
self.m_functor, range.begin(), range.end(), self.m_functor, range.begin(), range.end(),
ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) ); ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer)
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce<FunctorType, WorkTag >( self.m_functor ); exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
} }
public: public:
@ -197,26 +205,39 @@ public:
inline inline
void execute() const void execute() const
{ {
QthreadExec::resize_worker_scratch( ValueTraits::value_size( m_functor ) , 0 ); QthreadExec::resize_worker_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this ); Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data ); Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data );
if ( m_result_ptr ) { if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
template< class HostViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor ParallelReduce( const FunctorType & arg_functor
, const Policy & arg_policy , const Policy & arg_policy
, const HostViewType & arg_result_view ) , const ViewType & arg_result_view
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_result_ptr( arg_result_view.ptr_on_device() ) , m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.data() )
{ }
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ } { }
}; };
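The two constructors map onto the two user-facing forms of Kokkos::parallel_reduce. A sketch; Sum is assumed to be the experimental reducer type of this Kokkos generation, and functor / n are placeholders:

double total = 0 ;
// Scalar/View result: first constructor, ReducerType deduced as InvalidType.
Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthread >( 0 , n ) , functor , total );
// Reducer result: second constructor, which takes its output pointer
// from reducer.result_view().data().
Kokkos::Experimental::Sum< double > reducer( total );
Kokkos::parallel_reduce( Kokkos::RangePolicy< Kokkos::Qthread >( 0 , n ) , functor , reducer );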
@ -291,10 +312,12 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
template< class FunctorType , class ... Properties > template< class FunctorType , class ReducerType , class ... Properties >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, TeamPolicy< Properties... > , TeamPolicy< Properties... >
, Kokkos::Qthread > , ReducerType
, Kokkos::Qthread
>
{ {
private: private:
@ -303,14 +326,18 @@ private:
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
template< class TagType > template< class TagType >
@ -345,9 +372,10 @@ private:
ParallelReduce::template exec_team< WorkTag > ParallelReduce::template exec_team< WorkTag >
( self.m_functor ( self.m_functor
, Member( exec , self.m_policy ) , Member( exec , self.m_policy )
, ValueInit::init( self.m_functor , exec.exec_all_reduce_value() ) ); , ValueInit::init( ReducerConditional::select( self.m_functor , self.m_reducer )
, exec.exec_all_reduce_value() ) );
exec.template exec_all_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template exec_all_reduce< FunctorType, ReducerType, WorkTag >( self.m_functor, self.m_reducer );
} }
public: public:
@ -356,29 +384,43 @@ public:
void execute() const void execute() const
{ {
QthreadExec::resize_worker_scratch QthreadExec::resize_worker_scratch
( /* reduction memory */ ValueTraits::value_size( m_functor ) ( /* reduction memory */ ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) )
, /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) ); , /* team shared memory */ FunctorTeamShmemSize< FunctorType >::value( m_functor , m_policy.team_size() ) );
Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this ); Impl::QthreadExec::exec_all( Qthread::instance() , & ParallelReduce::exec , this );
const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result(); const pointer_type data = (pointer_type) QthreadExec::exec_all_reduce_result();
Kokkos::Impl::FunctorFinal< FunctorType , typename Policy::work_tag >::final( m_functor , data ); Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data );
if ( m_result_ptr ) { if ( m_result_ptr ) {
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
template< class ViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor , ParallelReduce( const FunctorType & arg_functor
const Policy & arg_policy , , const Policy & arg_policy
const ViewType & arg_result ) , const ViewType & arg_result
, typename std::enable_if<Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type< ReducerType >::value
, void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() ) , m_result_ptr( arg_result.ptr_on_device() )
{ } { }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{ }
}; };
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -395,8 +437,8 @@ private:
typedef Kokkos::RangePolicy< Traits ... > Policy ; typedef Kokkos::RangePolicy< Traits ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ;
typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ;

View File

@ -58,6 +58,8 @@
#include <Kokkos_Atomic.hpp> #include <Kokkos_Atomic.hpp>
#include <Qthread/Kokkos_Qthread_TaskPolicy.hpp> #include <Qthread/Kokkos_Qthread_TaskPolicy.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
namespace Kokkos { namespace Kokkos {
@ -122,8 +124,8 @@ Task::~TaskMember()
Task::TaskMember( const function_verify_type arg_verify Task::TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc , const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -145,8 +147,8 @@ Task::TaskMember( const function_verify_type arg_verify
} }
Task::TaskMember( const function_dealloc_type arg_dealloc Task::TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -316,12 +318,8 @@ aligned_t Task::qthread_func( void * arg )
, int(Kokkos::Experimental::TASK_STATE_EXECUTING) , int(Kokkos::Experimental::TASK_STATE_EXECUTING)
); );
// It is a single thread's responsibility to close out
// this task's execution.
bool close_out = false ;
if ( task->m_apply_team && ! task->m_apply_single ) { if ( task->m_apply_team && ! task->m_apply_single ) {
const Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ; Kokkos::Impl::QthreadTeamPolicyMember::TaskTeam task_team_tag ;
// Initialize team size and rank with shephered info // Initialize team size and rank with shephered info
Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag ); Kokkos::Impl::QthreadTeamPolicyMember member( task_team_tag );
@ -344,7 +342,7 @@ fflush(stdout);
if ( member.team_rank() == 0 ) task->closeout(); if ( member.team_rank() == 0 ) task->closeout();
member.team_barrier(); member.team_barrier();
} }
else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_apply_single_type>(1) ) { else if ( task->m_apply_team && task->m_apply_single == reinterpret_cast<function_single_type>(1) ) {
// Team hard-wired to one, no cloning // Team hard-wired to one, no cloning
Kokkos::Impl::QthreadTeamPolicyMember member ; Kokkos::Impl::QthreadTeamPolicyMember member ;
(*task->m_apply_team)( task , member ); (*task->m_apply_team)( task , member );
@ -488,5 +486,6 @@ void wait( Kokkos::Experimental::TaskPolicy< Kokkos::Qthread > & policy )
} // namespace Experimental } // namespace Experimental
} // namespace Kokkos } // namespace Kokkos
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */ #endif /* #if defined( KOKKOS_HAVE_QTHREAD ) */

View File

@ -69,6 +69,8 @@
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
#if defined( KOKKOS_ENABLE_TASKPOLICY )
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
namespace Kokkos { namespace Kokkos {
@ -80,17 +82,17 @@ class TaskMember< Kokkos::Qthread , void , void >
{ {
public: public:
typedef void (* function_apply_single_type) ( TaskMember * );
typedef void (* function_apply_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
typedef TaskMember * (* function_verify_type) ( TaskMember * ); typedef TaskMember * (* function_verify_type) ( TaskMember * );
typedef void (* function_single_type) ( TaskMember * );
typedef void (* function_team_type) ( TaskMember * , Kokkos::Impl::QthreadTeamPolicyMember & );
typedef void (* function_dealloc_type)( TaskMember * );
private: private:
const function_dealloc_type m_dealloc ; ///< Deallocation const function_dealloc_type m_dealloc ; ///< Deallocation
const function_verify_type m_verify ; ///< Result type verification const function_verify_type m_verify ; ///< Result type verification
const function_apply_single_type m_apply_single ; ///< Apply function const function_single_type m_apply_single ; ///< Apply function
const function_apply_team_type m_apply_team ; ///< Apply function const function_team_type m_apply_team ; ///< Apply function
int volatile * const m_active_count ; ///< Count of active tasks on this policy int volatile * const m_active_count ; ///< Count of active tasks on this policy
aligned_t m_qfeb ; ///< Qthread full/empty bit aligned_t m_qfeb ; ///< Qthread full/empty bit
TaskMember ** const m_dep ; ///< Dependences TaskMember ** const m_dep ; ///< Dependences
@ -130,8 +132,8 @@ protected :
// Used by TaskMember< Qthread , ResultType , void > // Used by TaskMember< Qthread , ResultType , void >
TaskMember( const function_verify_type arg_verify TaskMember( const function_verify_type arg_verify
, const function_dealloc_type arg_dealloc , const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -139,8 +141,8 @@ protected :
// Used for TaskMember< Qthread , void , void > // Used for TaskMember< Qthread , void , void >
TaskMember( const function_dealloc_type arg_dealloc TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -221,7 +223,7 @@ public:
typedef typename DerivedTaskType::functor_type functor_type ; typedef typename DerivedTaskType::functor_type functor_type ;
typedef typename functor_type::value_type value_type ; typedef typename functor_type::value_type value_type ;
const function_apply_single_type flag = reinterpret_cast<function_apply_single_type>( arg_is_team ? 0 : 1 ); const function_single_type flag = reinterpret_cast<function_single_type>( arg_is_team ? 0 : 1 );
DerivedTaskType * const task = DerivedTaskType * const task =
new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) ) new( allocate( sizeof(DerivedTaskType) , arg_dependence_capacity ) )
@ -379,13 +381,13 @@ protected:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ; typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ; typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ; typedef task_root_type::function_team_type function_team_type ;
inline inline
TaskMember( const function_dealloc_type arg_dealloc TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -413,13 +415,13 @@ public:
typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ; typedef TaskMember< Kokkos::Qthread , void , void > task_root_type ;
typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ; typedef TaskMember< Kokkos::Qthread , ResultType , void > task_base_type ;
typedef task_root_type::function_dealloc_type function_dealloc_type ; typedef task_root_type::function_dealloc_type function_dealloc_type ;
typedef task_root_type::function_apply_single_type function_apply_single_type ; typedef task_root_type::function_single_type function_single_type ;
typedef task_root_type::function_apply_team_type function_apply_team_type ; typedef task_root_type::function_team_type function_team_type ;
inline inline
TaskMember( const function_dealloc_type arg_dealloc TaskMember( const function_dealloc_type arg_dealloc
, const function_apply_single_type arg_apply_single , const function_single_type arg_apply_single
, const function_apply_team_type arg_apply_team , const function_team_type arg_apply_team
, volatile int & arg_active_count , volatile int & arg_active_count
, const unsigned arg_sizeof_derived , const unsigned arg_sizeof_derived
, const unsigned arg_dependence_capacity , const unsigned arg_dependence_capacity
@ -453,6 +455,7 @@ class TaskPolicy< Kokkos::Qthread >
public: public:
typedef Kokkos::Qthread execution_space ; typedef Kokkos::Qthread execution_space ;
typedef TaskPolicy execution_policy ;
typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ; typedef Kokkos::Impl::QthreadTeamPolicyMember member_type ;
private: private:
@ -489,14 +492,17 @@ public:
, const unsigned arg_task_team_size = 0 /* choose default */ , const unsigned arg_task_team_size = 0 /* choose default */
); );
TaskPolicy() = default ; KOKKOS_FUNCTION TaskPolicy() = default ;
TaskPolicy( TaskPolicy && rhs ) = default ; KOKKOS_FUNCTION TaskPolicy( TaskPolicy && rhs ) = default ;
TaskPolicy( const TaskPolicy & rhs ) = default ; KOKKOS_FUNCTION TaskPolicy( const TaskPolicy & rhs ) = default ;
TaskPolicy & operator = ( TaskPolicy && rhs ) = default ; KOKKOS_FUNCTION TaskPolicy & operator = ( TaskPolicy && rhs ) = default ;
TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ; KOKKOS_FUNCTION TaskPolicy & operator = ( const TaskPolicy & rhs ) = default ;
//---------------------------------------- //----------------------------------------
KOKKOS_INLINE_FUNCTION
int allocated_task_count() const { return m_active_count ; }
template< class ValueType > template< class ValueType >
const Future< ValueType , execution_space > & const Future< ValueType , execution_space > &
spawn( const Future< ValueType , execution_space > & f spawn( const Future< ValueType , execution_space > & f
@ -653,5 +659,6 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #define KOKKOS_QTHREAD_TASK_HPP */ #endif /* #define KOKKOS_QTHREAD_TASK_HPP */

View File

@ -3,26 +3,23 @@
# Cloning repository and branch: # Cloning repository and branch:
git clone https://github.com/stelleg/qthreads qthreads-with-clone git clone git@github.com:Qthreads/qthreads.git qthreads
cd qthreads-with-clone cd qthreads
# Added to .git/config # checkout branch with "cloned_tasks"
#
# [branch "cloned_tasks"]
# remote = origin
# merge = refs/heads/cloned_tasks
#
git branch cloned_tasks git checkout dev-kokkos
git checkout cloned_tasks
git pull # Configure/autogen
sh autogen.sh sh autogen.sh
# configurure with 'hwloc' installation: # configure with 'hwloc' installation:
./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR} ./configure CFLAGS="-DCLONED_TASKS -DQTHREAD_LOCAL_PRIORITY" --with-hwloc=${HWLOCDIR} --prefix=${INSTALLDIR}
# install
make install

View File

@ -53,6 +53,7 @@
#include <Kokkos_Core.hpp> #include <Kokkos_Core.hpp>
#include <impl/Kokkos_Error.hpp> #include <impl/Kokkos_Error.hpp>
#include <impl/Kokkos_CPUDiscovery.hpp> #include <impl/Kokkos_CPUDiscovery.hpp>
#include <impl/Kokkos_Profiling_Interface.hpp>
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -134,11 +135,7 @@ void ThreadsExec::driver(void)
ThreadsExec::ThreadsExec() ThreadsExec::ThreadsExec()
: m_pool_base(0) : m_pool_base(0)
#if ! KOKKOS_USING_EXP_VIEW
, m_scratch()
#else
, m_scratch(0) , m_scratch(0)
#endif
, m_scratch_reduce_end(0) , m_scratch_reduce_end(0)
, m_scratch_thread_end(0) , m_scratch_thread_end(0)
, m_numa_rank(0) , m_numa_rank(0)
@ -198,8 +195,6 @@ ThreadsExec::~ThreadsExec()
{ {
const unsigned entry = m_pool_size - ( m_pool_rank + 1 ); const unsigned entry = m_pool_size - ( m_pool_rank + 1 );
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( m_scratch ) { if ( m_scratch ) {
@ -210,12 +205,6 @@ ThreadsExec::~ThreadsExec()
Record::decrement( r ); Record::decrement( r );
} }
#else
m_scratch.clear();
#endif
m_pool_base = 0 ; m_pool_base = 0 ;
m_scratch_reduce_end = 0 ; m_scratch_reduce_end = 0 ;
m_scratch_thread_end = 0 ; m_scratch_thread_end = 0 ;
@ -439,8 +428,6 @@ void * ThreadsExec::root_reduce_scratch()
void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * ) void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
{ {
#if KOKKOS_USING_EXP_VIEW
typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ; typedef Kokkos::Experimental::Impl::SharedAllocationRecord< Kokkos::HostSpace , void > Record ;
if ( exec.m_scratch ) { if ( exec.m_scratch ) {
@ -451,19 +438,11 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
Record::decrement( r ); Record::decrement( r );
} }
#else
exec.m_scratch.clear();
#endif
exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ; exec.m_scratch_reduce_end = s_threads_process.m_scratch_reduce_end ;
exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ; exec.m_scratch_thread_end = s_threads_process.m_scratch_thread_end ;
if ( s_threads_process.m_scratch_thread_end ) { if ( s_threads_process.m_scratch_thread_end ) {
#if KOKKOS_USING_EXP_VIEW
// Allocate tracked memory: // Allocate tracked memory:
{ {
Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end ); Record * const r = Record::allocate( Kokkos::HostSpace() , "thread_scratch" , s_threads_process.m_scratch_thread_end );
@ -475,15 +454,6 @@ void ThreadsExec::execute_resize_scratch( ThreadsExec & exec , const void * )
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch ); unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch );
#else
exec.m_scratch =
HostSpace::allocate_and_track( "thread_scratch" , s_threads_process.m_scratch_thread_end );
unsigned * ptr = reinterpret_cast<unsigned *>( exec.m_scratch.alloc_ptr() );
#endif
unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned); unsigned * const end = ptr + s_threads_process.m_scratch_thread_end / sizeof(unsigned);
// touch on this thread // touch on this thread
@ -520,11 +490,7 @@ void * ThreadsExec::resize_scratch( size_t reduce_size , size_t thread_size )
s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ; s_threads_process.m_scratch = s_threads_exec[0]->m_scratch ;
} }
#if KOKKOS_USING_EXP_VIEW
return s_threads_process.m_scratch ; return s_threads_process.m_scratch ;
#else
return s_threads_process.m_scratch.alloc_ptr() ;
#endif
} }
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -758,6 +724,9 @@ void ThreadsExec::initialize( unsigned thread_count ,
// Init the array used for arbitrarily sized atomics // Init the array used for arbitrarily sized atomics
Impl::init_lock_array_host_space(); Impl::init_lock_array_host_space();
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::initialize();
#endif
} }
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@ -807,6 +776,10 @@ void ThreadsExec::finalize()
s_threads_process.m_pool_size = 1 ; s_threads_process.m_pool_size = 1 ;
s_threads_process.m_pool_fan_size = 0 ; s_threads_process.m_pool_fan_size = 0 ;
s_threads_process.m_pool_state = ThreadsExec::Inactive ; s_threads_process.m_pool_state = ThreadsExec::Inactive ;
#if (KOKKOS_ENABLE_PROFILING)
Kokkos::Profiling::finalize();
#endif
} }
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------

View File

@ -49,7 +49,6 @@
#include <utility> #include <utility>
#include <impl/Kokkos_spinwait.hpp> #include <impl/Kokkos_spinwait.hpp>
#include <impl/Kokkos_FunctorAdapter.hpp> #include <impl/Kokkos_FunctorAdapter.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <Kokkos_Atomic.hpp> #include <Kokkos_Atomic.hpp>
@ -89,11 +88,7 @@ private:
ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in ThreadsExec * const * m_pool_base ; ///< Base for pool fan-in
#if ! KOKKOS_USING_EXP_VIEW
Impl::AllocationTracker m_scratch ;
#else
void * m_scratch ; void * m_scratch ;
#endif
int m_scratch_reduce_end ; int m_scratch_reduce_end ;
int m_scratch_thread_end ; int m_scratch_thread_end ;
int m_numa_rank ; int m_numa_rank ;
@ -138,19 +133,10 @@ public:
static int get_thread_count(); static int get_thread_count();
static ThreadsExec * get_thread( const int init_thread_rank ); static ThreadsExec * get_thread( const int init_thread_rank );
#if ! KOKKOS_USING_EXP_VIEW
inline void * reduce_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()); }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const { return reinterpret_cast<unsigned char *>(m_scratch.alloc_ptr()) + m_scratch_reduce_end ; }
#else
inline void * reduce_memory() const { return m_scratch ; } inline void * reduce_memory() const { return m_scratch ; }
KOKKOS_INLINE_FUNCTION void * scratch_memory() const KOKKOS_INLINE_FUNCTION void * scratch_memory() const
{ return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; } { return reinterpret_cast<unsigned char *>(m_scratch) + m_scratch_reduce_end ; }
#endif
KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; } KOKKOS_INLINE_FUNCTION int volatile & state() { return m_pool_state ; }
KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; } KOKKOS_INLINE_FUNCTION ThreadsExec * const * pool_base() const { return m_pool_base ; }

View File

@ -129,15 +129,15 @@ public:
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_shmem() const const execution_space::scratch_memory_space & team_shmem() const
{ return m_team_shared.set_team_thread_mode(1,0) ; } { return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & team_scratch(int) const const execution_space::scratch_memory_space & team_scratch(int) const
{ return m_team_shared.set_team_thread_mode(1,0) ; } { return m_team_shared.set_team_thread_mode(0,1,0) ; }
KOKKOS_INLINE_FUNCTION KOKKOS_INLINE_FUNCTION
const execution_space::scratch_memory_space & thread_scratch(int) const const execution_space::scratch_memory_space & thread_scratch(int) const
{ return m_team_shared.set_team_thread_mode(team_size(),team_rank()) ; } { return m_team_shared.set_team_thread_mode(0,team_size(),team_rank()) ; }
KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; } KOKKOS_INLINE_FUNCTION int league_rank() const { return m_league_rank ; }
KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; } KOKKOS_INLINE_FUNCTION int league_size() const { return m_league_size ; }
@ -433,10 +433,11 @@ public:
void next_static() void next_static()
{ {
if ( ++m_league_rank < m_league_end ) { if ( m_league_rank < m_league_end ) {
team_barrier(); team_barrier();
set_team_shared(); set_team_shared();
} }
m_league_rank++;
} }
bool valid_dynamic() { bool valid_dynamic() {
@ -468,10 +469,11 @@ public:
if(m_invalid_thread) if(m_invalid_thread)
return; return;
if ( m_league_rank < m_league_chunk_end ) {
team_barrier(); team_barrier();
if ( ++m_league_rank < m_league_chunk_end ) {
set_team_shared(); set_team_shared();
} }
m_league_rank++;
} }
void set_league_shmem( const int arg_league_rank void set_league_shmem( const int arg_league_rank
@ -504,8 +506,8 @@ private:
int m_team_alloc ; int m_team_alloc ;
int m_team_iter ; int m_team_iter ;
size_t m_team_scratch_size; size_t m_team_scratch_size[2];
size_t m_thread_scratch_size; size_t m_thread_scratch_size[2];
int m_chunk_size; int m_chunk_size;
@ -549,8 +551,10 @@ public:
m_team_size = p.m_team_size; m_team_size = p.m_team_size;
m_team_alloc = p.m_team_alloc; m_team_alloc = p.m_team_alloc;
m_team_iter = p.m_team_iter; m_team_iter = p.m_team_iter;
m_team_scratch_size = p.m_team_scratch_size; m_team_scratch_size[0] = p.m_team_scratch_size[0];
m_thread_scratch_size = p.m_thread_scratch_size; m_thread_scratch_size[0] = p.m_thread_scratch_size[0];
m_team_scratch_size[1] = p.m_team_scratch_size[1];
m_thread_scratch_size[1] = p.m_thread_scratch_size[1];
m_chunk_size = p.m_chunk_size; m_chunk_size = p.m_chunk_size;
return *this; return *this;
} }
@ -577,7 +581,12 @@ public:
inline int team_size() const { return m_team_size ; } inline int team_size() const { return m_team_size ; }
inline int team_alloc() const { return m_team_alloc ; } inline int team_alloc() const { return m_team_alloc ; }
inline int league_size() const { return m_league_size ; } inline int league_size() const { return m_league_size ; }
inline size_t scratch_size() const { return m_team_scratch_size + m_team_size*m_thread_scratch_size ; } inline size_t scratch_size(const int& level, int team_size_ = -1 ) const {
if(team_size_ < 0)
team_size_ = m_team_size;
return m_team_scratch_size[level] + team_size_*m_thread_scratch_size[level] ;
}
inline int team_iter() const { return m_team_iter ; } inline int team_iter() const { return m_team_iter ; }
/** \brief Specify league size, request team size */ /** \brief Specify league size, request team size */
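scratch_size() now takes the level explicitly plus an optional prospective team size. A sketch of the intended call patterns (the variable names are invented here):

// Total bytes one team needs across both scratch levels:
const size_t total = policy.scratch_size( 0 ) + policy.scratch_size( 1 );
// Probe what level 0 would cost with an 8-thread team before the
// policy's own team size is final; team_size_ < 0 means "use mine".
const size_t probe = policy.scratch_size( 0 , 8 );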
@ -588,8 +597,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,team_size_request); (void) vector_length_request; } { init(league_size_request,team_size_request); (void) vector_length_request; }
@ -601,8 +610,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); } { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -612,8 +621,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,team_size_request); } { init(league_size_request,team_size_request); }
@ -623,8 +632,8 @@ public:
: m_league_size(0) : m_league_size(0)
, m_team_size(0) , m_team_size(0)
, m_team_alloc(0) , m_team_alloc(0)
, m_team_scratch_size ( 0 ) , m_team_scratch_size { 0 , 0 }
, m_thread_scratch_size ( 0 ) , m_thread_scratch_size { 0 , 0 }
, m_chunk_size(0) , m_chunk_size(0)
{ init(league_size_request,traits::execution_space::thread_pool_size(2)); } { init(league_size_request,traits::execution_space::thread_pool_size(2)); }
@ -639,26 +648,23 @@ public:
/** \brief set per team scratch size for a specific level of the scratch hierarchy */ /** \brief set per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const { inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team) const {
(void) level;
TeamPolicyInternal p = *this; TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value; p.m_team_scratch_size[level] = per_team.value;
return p; return p;
}; };
/** \brief set per thread scratch size for a specific level of the scratch hierarchy */ /** \brief set per thread scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const { inline TeamPolicyInternal set_scratch_size(const int& level, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this; TeamPolicyInternal p = *this;
p.m_thread_scratch_size = per_thread.value; p.m_thread_scratch_size[level] = per_thread.value;
return p; return p;
}; };
/** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */ /** \brief set per thread and per team scratch size for a specific level of the scratch hierarchy */
inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const { inline TeamPolicyInternal set_scratch_size(const int& level, const PerTeamValue& per_team, const PerThreadValue& per_thread) const {
(void) level;
TeamPolicyInternal p = *this; TeamPolicyInternal p = *this;
p.m_team_scratch_size = per_team.value; p.m_team_scratch_size[level] = per_team.value;
p.m_thread_scratch_size = per_thread.value; p.m_thread_scratch_size[level] = per_thread.value;
return p; return p;
}; };

View File

@ -264,7 +264,7 @@ public:
, const Policy & arg_policy ) , const Policy & arg_policy )
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ } { }
}; };
@ -272,9 +272,10 @@ public:
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and RangePolicy */ /* ParallelReduce with Kokkos::Threads and RangePolicy */
template< class FunctorType , class ... Traits > template< class FunctorType , class ReducerType, class ... Traits >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::RangePolicy< Traits ... > , Kokkos::RangePolicy< Traits ... >
, ReducerType
, Kokkos::Threads , Kokkos::Threads
> >
{ {
@ -286,14 +287,18 @@ private:
typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::WorkRange WorkRange ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
template< class TagType > template< class TagType >
@ -344,9 +349,9 @@ private:
ParallelReduce::template exec_range< WorkTag > ParallelReduce::template exec_range< WorkTag >
( self.m_functor , range.begin() , range.end() ( self.m_functor , range.begin() , range.end()
, ValueInit::init( self.m_functor , exec.reduce_memory() ) ); , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
} }
template<class Schedule> template<class Schedule>
@ -362,7 +367,7 @@ private:
exec.barrier(); exec.barrier();
long work_index = exec.get_work_index(); long work_index = exec.get_work_index();
reference_type update = ValueInit::init( self.m_functor , exec.reduce_memory() ); reference_type update = ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() );
while(work_index != -1) { while(work_index != -1) {
const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size(); const Member begin = static_cast<Member>(work_index) * self.m_policy.chunk_size();
const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end(); const Member end = begin + self.m_policy.chunk_size() < self.m_policy.end()?begin+self.m_policy.chunk_size():self.m_policy.end();
@ -372,7 +377,7 @@ private:
work_index = exec.get_work_index(); work_index = exec.get_work_index();
} }
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
} }
public: public:
@ -380,7 +385,7 @@ public:
inline inline
void execute() const void execute() const
{ {
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , 0 ); ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , 0 );
ThreadsExec::start( & ParallelReduce::exec , this ); ThreadsExec::start( & ParallelReduce::exec , this );
@ -391,7 +396,7 @@ public:
const pointer_type data = const pointer_type data =
(pointer_type) ThreadsExec::root_reduce_scratch(); (pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
@ -399,9 +404,14 @@ public:
template< class HostViewType > template< class HostViewType >
ParallelReduce( const FunctorType & arg_functor , ParallelReduce( const FunctorType & arg_functor ,
const Policy & arg_policy , const Policy & arg_policy ,
const HostViewType & arg_result_view ) const HostViewType & arg_result_view ,
typename std::enable_if<
Kokkos::is_view< HostViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result_view.ptr_on_device() ) , m_result_ptr( arg_result_view.ptr_on_device() )
{ {
static_assert( Kokkos::is_view< HostViewType >::value static_assert( Kokkos::is_view< HostViewType >::value
@ -410,14 +420,30 @@ public:
static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value static_assert( std::is_same< typename HostViewType::memory_space , HostSpace >::value
, "Kokkos::Threads reduce result must be a View in HostSpace" ); , "Kokkos::Threads reduce result must be a View in HostSpace" );
} }
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
}; };
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
/* ParallelReduce with Kokkos::Threads and TeamPolicy */ /* ParallelReduce with Kokkos::Threads and TeamPolicy */
template< class FunctorType , class ... Properties > template< class FunctorType , class ReducerType, class ... Properties >
class ParallelReduce< FunctorType class ParallelReduce< FunctorType
, Kokkos::TeamPolicy< Properties ... > , Kokkos::TeamPolicy< Properties ... >
, ReducerType
, Kokkos::Threads , Kokkos::Threads
> >
{ {
@ -426,14 +452,19 @@ private:
typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ; typedef Kokkos::Impl::TeamPolicyInternal< Kokkos::Threads, Properties ... > Policy ;
typedef typename Policy::work_tag WorkTag ; typedef typename Policy::work_tag WorkTag ;
typedef typename Policy::member_type Member ; typedef typename Policy::member_type Member ;
typedef Kokkos::Impl::FunctorValueTraits< FunctorType, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< FunctorType, WorkTag > ValueInit ; typedef Kokkos::Impl::if_c< std::is_same<InvalidType,ReducerType>::value, FunctorType, ReducerType> ReducerConditional;
typedef typename ReducerConditional::type ReducerTypeFwd;
typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ;
typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ;
typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::pointer_type pointer_type ;
typedef typename ValueTraits::reference_type reference_type ; typedef typename ValueTraits::reference_type reference_type ;
const FunctorType m_functor ; const FunctorType m_functor ;
const Policy m_policy ; const Policy m_policy ;
const ReducerType m_reducer ;
const pointer_type m_result_ptr ; const pointer_type m_result_ptr ;
const int m_shared ; const int m_shared ;
@ -464,9 +495,9 @@ private:
ParallelReduce::template exec_team< WorkTag > ParallelReduce::template exec_team< WorkTag >
( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) ( self.m_functor , Member( & exec , self.m_policy , self.m_shared )
, ValueInit::init( self.m_functor , exec.reduce_memory() ) ); , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) );
exec.template fan_in_reduce< FunctorType , WorkTag >( self.m_functor ); exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) );
} }
public: public:
@ -474,7 +505,7 @@ public:
inline inline
void execute() const void execute() const
{ {
ThreadsExec::resize_scratch( ValueTraits::value_size( m_functor ) , Policy::member_type::team_reduce_size() + m_shared ); ThreadsExec::resize_scratch( ValueTraits::value_size( ReducerConditional::select(m_functor , m_reducer) ) , Policy::member_type::team_reduce_size() + m_shared );
ThreadsExec::start( & ParallelReduce::exec , this ); ThreadsExec::start( & ParallelReduce::exec , this );
@ -484,20 +515,41 @@ public:
const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch(); const pointer_type data = (pointer_type) ThreadsExec::root_reduce_scratch();
const unsigned n = ValueTraits::value_count( m_functor ); const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) );
for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; } for ( unsigned i = 0 ; i < n ; ++i ) { m_result_ptr[i] = data[i]; }
} }
} }
template< class ViewType > template< class ViewType >
ParallelReduce( const FunctorType & arg_functor inline
, const Policy & arg_policy ParallelReduce( const FunctorType & arg_functor ,
, const ViewType & arg_result ) const Policy & arg_policy ,
const ViewType & arg_result ,
typename std::enable_if<
Kokkos::is_view< ViewType >::value &&
!Kokkos::is_reducer_type<ReducerType>::value
,void*>::type = NULL)
: m_functor( arg_functor ) : m_functor( arg_functor )
, m_policy( arg_policy ) , m_policy( arg_policy )
, m_reducer( InvalidType() )
, m_result_ptr( arg_result.ptr_on_device() ) , m_result_ptr( arg_result.ptr_on_device() )
, m_shared( arg_policy.scratch_size() + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) ) , m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{ } {}
inline
ParallelReduce( const FunctorType & arg_functor
, Policy arg_policy
, const ReducerType& reducer )
: m_functor( arg_functor )
, m_policy( arg_policy )
, m_reducer( reducer )
, m_result_ptr( reducer.result_view().data() )
, m_shared( arg_policy.scratch_size(0) + arg_policy.scratch_size(1) + FunctorTeamShmemSize< FunctorType >::value( arg_functor , arg_policy.team_size() ) )
{
/*static_assert( std::is_same< typename ViewType::memory_space
, Kokkos::HostSpace >::value
, "Reduction result on Kokkos::OpenMP must be a Kokkos::View in HostSpace" );*/
}
}; };
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------

View File

@ -46,9 +46,10 @@
#include <stdio.h> #include <stdio.h>
#include <iostream> #include <iostream>
#include <sstream> #include <sstream>
#include <Kokkos_Core.hpp>
#include <Threads/Kokkos_Threads_TaskPolicy.hpp> #include <Threads/Kokkos_Threads_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD ) #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
#define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) )) #define QLOCK (reinterpret_cast<void*>( ~((uintptr_t)0) ))
#define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 )) #define QDENIED (reinterpret_cast<void*>( ~((uintptr_t)0) - 1 ))
@ -87,9 +88,8 @@ ThreadsTaskPolicyQueue::ThreadsTaskPolicyQueue
, const unsigned arg_task_team_size , const unsigned arg_task_team_size
) )
: m_space( Kokkos::Threads::memory_space() : m_space( Kokkos::Threads::memory_space()
, arg_task_max_size , arg_task_max_size * arg_task_max_count * 1.2
, arg_task_max_size * arg_task_max_count , 16 /* log2(superblock size) */
, 1 /* only one level of memory pool */
) )
, m_team { 0 , 0 , 0 } , m_team { 0 , 0 , 0 }
, m_serial { 0 , 0 , 0 } , m_serial { 0 , 0 , 0 }
@ -624,10 +624,10 @@ ThreadsTaskPolicyQueue::allocate_task
// User created task memory pool with an estimate, // User created task memory pool with an estimate,
// if estimate is to low then report and throw exception. // if estimate is to low then report and throw exception.
if ( m_space.get_min_chunk_size() < size_alloc ) { if ( m_space.get_min_block_size() < size_alloc ) {
fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n" fprintf(stderr,"TaskPolicy<Threads> task allocation requires %d bytes on memory pool with %d byte chunk size\n"
, int(size_alloc) , int(size_alloc)
, int(m_space.get_min_chunk_size()) , int(m_space.get_min_block_size())
); );
fflush(stderr); fflush(stderr);
Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate"); Kokkos::Impl::throw_runtime_exception("TaskMember< Threads >::task_allocate");
@ -926,5 +926,5 @@ void Task::clear_dependence()
} /* namespace Experimental */ } /* namespace Experimental */
} /* namespace Kokkos */ } /* namespace Kokkos */
#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */ #endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */

View File

@ -50,7 +50,7 @@
#include <Kokkos_Threads.hpp> #include <Kokkos_Threads.hpp>
#include <Kokkos_TaskPolicy.hpp> #include <Kokkos_TaskPolicy.hpp>
#if defined( KOKKOS_HAVE_PTHREAD ) #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY )
//---------------------------------------------------------------------------- //----------------------------------------------------------------------------
@@ -737,10 +737,9 @@ public:
} /* namespace Experimental */
} /* namespace Kokkos */
-#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) */
//----------------------------------------------------------------------------
+#endif /* #if defined( KOKKOS_HAVE_PTHREAD ) && defined( KOKKOS_ENABLE_TASKPOLICY ) */
#endif /* #ifndef KOKKOS_THREADS_TASKPOLICY_HPP */

View File

@@ -246,8 +246,8 @@ private:
  enum : uintptr_t { DO_NOT_DEREF_FLAG = 0x01ul };
  // The allocation record resides in Host memory space
+ Record    * m_record ;
  uintptr_t   m_record_bits ;
- Record    * m_record ;
public:

View File

@@ -47,8 +47,6 @@
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------
-#if KOKKOS_USING_EXP_VIEW
namespace Kokkos {
/* For backward compatibility */
@@ -68,8 +66,6 @@ struct ViewAllocateWithoutInitializing {
} /* namespace Kokkos */
-#endif
//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

View File

@@ -2604,19 +2604,25 @@ class ViewMapping< DstTraits , SrcTraits ,
    &&
    std::is_same< typename DstTraits::specialize , void >::value
    &&
+   std::is_same< typename SrcTraits::specialize , void >::value
+   &&
+   (
+     std::is_same< typename DstTraits::array_layout , typename SrcTraits::array_layout >::value
+     ||
+     (
       (
         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutLeft >::value ||
         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutRight >::value ||
         std::is_same< typename DstTraits::array_layout , Kokkos::LayoutStride >::value
       )
       &&
-   std::is_same< typename SrcTraits::specialize , void >::value
-   &&
       (
         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutLeft >::value ||
         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutRight >::value ||
         std::is_same< typename SrcTraits::array_layout , Kokkos::LayoutStride >::value
       )
+     )
+   )
  )>::type >
{
private:
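Read as a condition, the change broadens assignability: two Views are convertible either when they share an array_layout exactly, or when both layouts are drawn from the LayoutLeft/LayoutRight/LayoutStride set. A hedged illustration of the second branch (assuming a default host execution space):

#include <Kokkos_Core.hpp>

int main(int argc, char* argv[]) {
  Kokkos::initialize(argc, argv);
  {
    Kokkos::View<double**, Kokkos::LayoutLeft> a("a", 10, 3);
    // LayoutLeft -> LayoutStride: both layouts are in the allowed set,
    // so the ViewMapping specialization above enables this conversion.
    Kokkos::View<double**, Kokkos::LayoutStride> s = a;
    (void) s;
  }
  Kokkos::finalize();
  return 0;
}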

View File

@@ -1,848 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_Core_fwd.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST )
#include <Kokkos_Atomic.hpp>
#include <impl/Kokkos_Singleton.hpp>
#include <impl/Kokkos_AllocationTracker.hpp>
#include <impl/Kokkos_Error.hpp>
#include <string>
#include <vector>
#include <sstream>
#include <algorithm>
#include <utility>
#include <cstdlib>
#include <cstring>
#include <iostream>
#include <iomanip>
/* Enable clean up of memory leaks */
#define CLEAN_UP_MEMORY_LEAKS 0
namespace Kokkos { namespace Impl {
namespace {
//-----------------------------------------------------------------------------
// AllocationRecord
//-----------------------------------------------------------------------------
//
// Used to track details about an allocation and provide a ref count
// sizeof(AllocationRecord) == 128
struct AllocationRecord
{
enum {
OFFSET = sizeof(AllocatorBase*) // allocator
+ sizeof(void*) // alloc_ptr
+ sizeof(uint64_t) // alloc_size
+ sizeof(AllocatorAttributeBase*) // attribute
+ sizeof(uint32_t) // node_index
+ sizeof(uint32_t) // ref_count
, LABEL_LENGTH = 128 - OFFSET
};
AllocatorBase * const allocator;
void * const alloc_ptr;
const uint64_t alloc_size;
AllocatorAttributeBase * const attribute;
const int32_t node_index;
volatile uint32_t ref_count;
const char label[LABEL_LENGTH];
AllocationRecord( AllocatorBase * const arg_allocator
, void * arg_alloc_ptr
, uint64_t arg_alloc_size
, int32_t arg_node_index
, const std::string & arg_label
)
: allocator(arg_allocator)
, alloc_ptr(arg_alloc_ptr)
, alloc_size(arg_alloc_size)
, attribute(NULL)
, node_index(arg_node_index)
, ref_count(1)
, label() // zero fill
{
const size_t length = static_cast<size_t>(LABEL_LENGTH-1u) < arg_label.size() ? static_cast<size_t>(LABEL_LENGTH-1u) : arg_label.size();
strncpy( const_cast<char *>(label), arg_label.c_str(), length );
}
~AllocationRecord()
{
if (attribute) {
delete attribute;
}
}
uint32_t increment_ref_count()
{
uint32_t old_value = atomic_fetch_add( &ref_count, static_cast<uint32_t>(1) );
return old_value + 1u;
}
uint32_t decrement_ref_count()
{
uint32_t old_value = atomic_fetch_sub( &ref_count, static_cast<uint32_t>(1) );
return old_value - 1u;
}
void print( std::ostream & oss ) const
{
oss << "{ " << allocator->name()
<< " } : \"" << label
<< "\" ref_count(" << ref_count
<< ") memory[ " << alloc_ptr
<< " + " << alloc_size
<< " ]" ;
}
bool set_attribute( AllocatorAttributeBase * attr )
{
bool result = false;
if (attribute == NULL) {
result = NULL == atomic_compare_exchange( const_cast<AllocatorAttributeBase **>(&attribute)
, reinterpret_cast<AllocatorAttributeBase *>(NULL)
, attr );
}
return result;
}
// disallow copy and assignment
AllocationRecord( const AllocationRecord & );
AllocationRecord & operator=(const AllocationRecord &);
};
template <int NumBlocks>
struct Bitset
{
enum { blocks = NumBlocks };
enum { size = blocks * 64 };
enum { block_mask = 63u };
enum { block_shift = 6 };
// used to find free bits in a bitset
static int count_trailing_zeros(uint64_t x)
{
#if defined( KOKKOS_COMPILER_GNU ) || defined( KOKKOS_COMPILER_CLANG ) || defined( KOKKOS_COMPILER_APPLECC )
return x ? __builtin_ctzll(x) : 64;
#elif defined( KOKKOS_COMPILER_INTEL )
enum { shift = 32 };
enum { mask = (static_cast<uint64_t>(1) << shift) - 1u };
return (x & mask) ? _bit_scan_forward(static_cast<int>(x & mask)) :
(x >> shift) ? shift + _bit_scan_forward(static_cast<int>(x >> shift)) :
64 ;
#elif defined( KOKKOS_COMPILER_IBM )
return x ? __cnttz8(x) : 64;
#else
int i = 0;
for (; ((x & (static_cast<uint64_t>(1) << i)) == 0u) && i < 64; ++i ) {}
return i;
#endif
}
Bitset()
: m_bits()
{
for (int i=0; i < blocks; ++i) {
m_bits[i] = 0u;
}
}
bool set( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return !( atomic_fetch_or( m_bits + (i >> block_shift), bit ) & bit );
}
bool reset( int i )
{
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return atomic_fetch_and( m_bits + (i >> block_shift), ~bit ) & bit;
}
bool test( int i )
{
const uint64_t block = m_bits[ i >> block_shift ];
const uint64_t bit = static_cast<uint64_t>(1) << ( i & block_mask );
return block & bit;
}
int find_first_unset() const
{
for (int i=0; i < blocks; ++i) {
const uint64_t block = m_bits[i];
int b = count_trailing_zeros( ~block );
if ( b < 64 ) {
return (i << block_shift) + b;
}
}
return size;
}
volatile uint64_t m_bits[blocks];
};
//-----------------------------------------------------------------------------
// AllocationRecordPool -- singleton class
//
// global_alloc_rec_pool is the ONLY instance of this class
//
//-----------------------------------------------------------------------------
// Record AllocationRecords in a lock-free circular list.
// Each node in the list has a buffer with space for 959 ((15*64)-1) records
// managed by a bitset. Atomics are used to set and reset bits in the bit set.
// The head of the list is atomically updated to the last node found with
// unused space.
//
// Cost to create an allocation record: amortized O(1), worst case O(num nodes)
// Cost to destroy an allocation record: O(1)
//
// Singleton allocations are pushed onto a lock-free stack that is destroyed
// after the circular list of allocation records.
struct AllocationRecordPool
{
enum { BITSET_BLOCKS = 15 };
typedef Bitset<BITSET_BLOCKS> bitset_type;
enum { BUFFER_SIZE = (bitset_type::size - 1) * sizeof(AllocationRecord) };
struct AllocationNode
{
AllocationNode()
: next()
, bitset()
, buffer()
{
// set the first bit to used
bitset.set(0);
}
void * get_buffer( int32_t node_index )
{
return buffer + (node_index-1) * sizeof(AllocationRecord);
}
// return 0 if no space is available in the node
int32_t get_node_index()
{
int32_t node_index = 0;
do {
node_index = bitset.find_first_unset();
// successfully claimed a bit
if ( node_index != bitset.size && bitset.set(node_index) )
{
return node_index;
}
} while ( node_index != bitset.size );
return 0;
}
void clear_node_index( int32_t node_index )
{
bitset.reset(node_index);
}
AllocationNode * next;
bitset_type bitset;
char buffer[BUFFER_SIZE];
};
struct SingletonNode
{
void * buffer;
SingletonNode * next;
Impl::singleton_destroy_function_type destroy;
SingletonNode( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
: buffer(NULL)
, next(NULL)
, destroy(destroy_func)
{
if (size) {
buffer = malloc(size);
create_func(buffer);
}
}
~SingletonNode()
{
if (buffer) {
try {
destroy(buffer);
} catch(...) {}
free(buffer);
}
}
};
AllocationRecordPool()
: head( new AllocationNode() )
, singleton_head(NULL)
{
// setup ring
head->next = head;
}
~AllocationRecordPool()
{
// delete allocation records
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
#if defined( KOKKOS_DEBUG_PRINT_ALLOCATION_BITSET )
// print node bitset
for (int i=0; i < bitset_type::blocks; ++i ) {
std::cout << std::hex << std::showbase << curr->bitset.m_bits[i] << " ";
}
std::cout << std::endl;
#endif
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
#if CLEAN_UP_MEMORY_LEAKS
/* Cleaning up memory leaks prevents memory error detection tools
* from reporting the original source of allocation, which can
* impede debugging with such tools.
*/
try {
destroy(alloc_rec);
}
catch(...) {}
#endif
}
}
curr->next = NULL;
delete curr;
curr = next;
} while ( curr != start );
//if ( !string_vec.empty() ) {
// std::sort( string_vec.begin(), string_vec.end() );
//
// std::ostringstream oss;
// oss << "Error: Allocation pool destroyed with the following memory leak(s):\n";
// for (size_t i=0; i< string_vec.size(); ++i)
// {
// oss << " " << string_vec[i] << std::endl;
// }
//
// std::cerr << oss.str() << std::endl;
//}
}
// delete singletons
{
SingletonNode * curr = singleton_head;
while (curr) {
SingletonNode * next = curr->next;
delete curr;
curr = next;
}
}
}
AllocationRecord * create( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
AllocationNode * start = volatile_load(&head);
AllocationNode * curr = start;
int32_t node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
while (node_index == 0 && curr != start)
{
node_index = curr->get_node_index();
if (node_index == 0) {
curr = volatile_load(&curr->next);
}
}
// Need to allocate and insert a new node
if (node_index == 0 && curr == start)
{
AllocationNode * new_node = new AllocationNode();
node_index = new_node->get_node_index();
AllocationNode * next = NULL;
do {
next = volatile_load(&curr->next);
new_node->next = next;
memory_fence();
} while ( next != atomic_compare_exchange( &(curr->next), next, new_node ) );
curr = new_node;
}
void * buffer = curr->get_buffer(node_index);
// try to set head to curr
if ( start != curr )
{
atomic_compare_exchange( & head, start, curr );
}
return new (buffer) AllocationRecord( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, node_index
, arg_label
);
}
void destroy( AllocationRecord * alloc_rec )
{
if (alloc_rec) {
const int32_t node_index = alloc_rec->node_index;
AllocationNode * node = get_node( alloc_rec );
// deallocate memory
alloc_rec->allocator->deallocate( alloc_rec->alloc_ptr, alloc_rec->alloc_size );
// call destructor
alloc_rec->~AllocationRecord();
// wait for writes to complete
memory_fence();
// clear node index
node->clear_node_index( node_index );
}
}
void * create_singleton( size_t size, Impl::singleton_create_function_type create_func, Impl::singleton_destroy_function_type destroy_func )
{
SingletonNode * node = new SingletonNode( size, create_func, destroy_func );
SingletonNode * next;
// insert new node at the head of the list
do {
next = volatile_load(&singleton_head);
node->next = next;
} while ( next != atomic_compare_exchange( &singleton_head, next, node ) );
return node->buffer;
}
void print_memory( std::ostream & out ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
std::vector< std::string > string_vec;
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
std::ostringstream oss;
alloc_rec->print( oss );
string_vec.push_back( oss.str() );
}
}
curr = next;
} while ( curr != start );
if ( !string_vec.empty() ) {
std::sort( string_vec.begin(), string_vec.end() );
std::ostringstream oss;
oss << "Tracked Memory:" << std::endl;
for (size_t i=0; i< string_vec.size(); ++i)
{
oss << " " << string_vec[i] << std::endl;
}
out << oss.str() << std::endl;
}
else {
out << "No Tracked Memory" << std::endl;
}
}
// find an AllocationRecord such that
// alloc_ptr <= ptr < alloc_ptr + alloc_size
// otherwise return NULL
AllocationRecord * find( void const * ptr, AllocatorBase const * allocator ) const
{
AllocationNode * start = head;
AllocationNode * curr = start;
char const * const char_ptr = reinterpret_cast<const char *>(ptr);
do {
AllocationNode * next = curr->next;
// bit zero does not map to an AllocationRecord
for ( int32_t i=1; i < bitset_type::size; ++i )
{
if (curr->bitset.test(i)) {
AllocationRecord * alloc_rec = reinterpret_cast<AllocationRecord *>( curr->get_buffer(i) );
char const * const alloc_ptr = reinterpret_cast<char const *>(alloc_rec->alloc_ptr);
if ( (allocator == alloc_rec->allocator)
&& (alloc_ptr <= char_ptr)
&& (char_ptr < (alloc_ptr + alloc_rec->alloc_size)) )
{
return alloc_rec;
}
}
}
curr = next;
} while ( curr != start );
return NULL;
}
private:
AllocationNode * get_node( AllocationRecord * alloc_rec )
{
return reinterpret_cast<AllocationNode *>( alloc_rec - alloc_rec->node_index);
}
AllocationNode * head;
SingletonNode * singleton_head;
};
// create the global pool for allocation records
AllocationRecordPool global_alloc_rec_pool;
// convert a uintptr_t to an AllocationRecord pointer
inline
AllocationRecord * to_alloc_rec( uintptr_t alloc_rec )
{
return reinterpret_cast<AllocationRecord *>( alloc_rec & ~static_cast<uintptr_t>(1) );
}
} // unnamed namespace
//-----------------------------------------------------------------------------
// Allocation Tracker methods
//-----------------------------------------------------------------------------
// Create a reference counted AllocationTracker
void AllocationTracker::initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label
)
{
if ( arg_allocator && arg_alloc_ptr && arg_alloc_size) {
// create record
AllocationRecord * alloc_rec = global_alloc_rec_pool.create( arg_allocator
, arg_alloc_ptr
, arg_alloc_size
, arg_label
);
m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
}
void AllocationTracker::reallocate( size_t size ) const
{
AllocationRecord * rec = to_alloc_rec( m_alloc_rec );
void * the_alloc_ptr = rec->allocator->reallocate( rec->alloc_ptr, rec->alloc_size, size );
if ( NULL != the_alloc_ptr )
{
*const_cast<void **>(&rec->alloc_ptr) = the_alloc_ptr;
*const_cast<uint64_t *>(&rec->alloc_size) = size;
}
else {
Impl::throw_runtime_exception( "Error: unable to reallocate allocation tracker");
}
}
void AllocationTracker::increment_ref_count() const
{
to_alloc_rec( m_alloc_rec )->increment_ref_count();
}
void AllocationTracker::decrement_ref_count() const
{
AllocationRecord * alloc_rec = to_alloc_rec( m_alloc_rec );
uint32_t the_ref_count = alloc_rec->decrement_ref_count();
if (the_ref_count == 0u) {
try {
global_alloc_rec_pool.destroy( alloc_rec );
}
catch(...) {}
}
}
namespace {
struct NullAllocator { static const char * name() { return "Null Allocator"; } };
}
AllocatorBase * AllocationTracker::allocator() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->allocator;
}
return Allocator<NullAllocator>::singleton();
}
void * AllocationTracker::alloc_ptr() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_ptr;
}
return NULL;
}
size_t AllocationTracker::alloc_size() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->alloc_size;
}
return 0u;
}
size_t AllocationTracker::ref_count() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->ref_count;
}
return 0u;
}
char const * AllocationTracker::label() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->label;
}
return "[Empty Allocation Tracker]";
}
void AllocationTracker::print( std::ostream & oss) const
{
if (m_alloc_rec & REF_COUNT_MASK) {
to_alloc_rec(m_alloc_rec)->print(oss);
}
else {
oss << label();
}
}
bool AllocationTracker::set_attribute( AllocatorAttributeBase * attr ) const
{
bool result = false;
if (m_alloc_rec & REF_COUNT_MASK) {
result = to_alloc_rec(m_alloc_rec)->set_attribute(attr);
}
return result;
}
AllocatorAttributeBase * AllocationTracker::attribute() const
{
if (m_alloc_rec & REF_COUNT_MASK) {
return to_alloc_rec(m_alloc_rec)->attribute;
}
return NULL;
}
void AllocationTracker::print_tracked_memory( std::ostream & out )
{
global_alloc_rec_pool.print_memory( out );
}
AllocationTracker AllocationTracker::find( void const * ptr, AllocatorBase const * arg_allocator )
{
AllocationRecord * alloc_rec = global_alloc_rec_pool.find(ptr, arg_allocator);
AllocationTracker tracker;
if ( alloc_rec != NULL )
{
if ( tracking_enabled() ) {
alloc_rec->increment_ref_count();
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec) | REF_COUNT_BIT;
}
else {
tracker.m_alloc_rec = reinterpret_cast<uintptr_t>(alloc_rec);
}
}
return tracker ;
}
//-----------------------------------------------------------------------------
// static AllocationTracker
//-----------------------------------------------------------------------------
#if defined( KOKKOS_USE_DECENTRALIZED_HOST )
namespace {
// TODO : Detect compiler support for thread local variables
#if defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_DEFAULT_DEVICE_TYPE_THREADS )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_OPENMP )
bool g_thread_local_tracking_enabled = true;
#pragma omp threadprivate(g_thread_local_tracking_enabled)
#elif defined( KOKKOS_HAVE_PTHREAD )
__thread bool g_thread_local_tracking_enabled = true;
#elif defined( KOKKOS_HAVE_SERIAL )
bool g_thread_local_tracking_enabled = true;
#endif
} // unnamed namespace
void AllocationTracker::disable_tracking()
{
g_thread_local_tracking_enabled = false;
}
void AllocationTracker::enable_tracking()
{
g_thread_local_tracking_enabled = true;
}
bool AllocationTracker::tracking_enabled()
{
return g_thread_local_tracking_enabled;
}
#else
namespace {
enum TrackingEnum { TRACKING_ENABLED, TRACKING_DISABLED };
volatile TrackingEnum g_tracking_enabled = TRACKING_ENABLED;
}
void AllocationTracker::disable_tracking()
{
if ( TRACKING_ENABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_ENABLED, TRACKING_DISABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already disabled");
}
}
void AllocationTracker::enable_tracking()
{
if ( TRACKING_DISABLED != atomic_compare_exchange( &g_tracking_enabled, TRACKING_DISABLED, TRACKING_ENABLED ) ) {
Impl::throw_runtime_exception("Error: Tracking already enabled");
}
}
bool AllocationTracker::tracking_enabled()
{
return g_tracking_enabled == TRACKING_ENABLED;
}
#endif
//-----------------------------------------------------------------------------
// create singleton free function
//-----------------------------------------------------------------------------
void * create_singleton( size_t size
, Impl::singleton_create_function_type create_func
, Impl::singleton_destroy_function_type destroy_func )
{
return global_alloc_rec_pool.create_singleton( size, create_func, destroy_func );
}
}} // namespace Kokkos::Impl
#endif /* #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) */
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
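The deleted tracker above parked records in a lock-free ring of nodes, each claiming slots through an atomic bitset. A minimal sketch of that claim/release pattern, using std::atomic and the GCC/Clang __builtin_ctzll intrinsic rather than the removed Kokkos wrappers:

#include <atomic>
#include <cstdint>
#include <cstdio>

struct SlotBitset {
  std::atomic<uint64_t> bits{0};

  // Returns a claimed slot in [0,64), or -1 if the word is full.
  int claim() {
    for (;;) {
      uint64_t current = bits.load(std::memory_order_relaxed);
      if (current == ~uint64_t(0)) return -1;      // no free slot
      int slot = __builtin_ctzll(~current);        // first zero bit
      uint64_t mask = uint64_t(1) << slot;
      // fetch_or returns the prior word; we own the slot only if the
      // bit was still clear when we set it.
      if (!(bits.fetch_or(mask) & mask)) return slot;
    }
  }

  void release(int slot) { bits.fetch_and(~(uint64_t(1) << slot)); }
};

int main() {
  SlotBitset b;
  int s0 = b.claim(), s1 = b.claim();
  std::printf("claimed %d and %d\n", s0, s1);  // claimed 0 and 1
  b.release(s0);
  return 0;
}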

View File

@@ -1,574 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_ALLOCATION_TRACKER_HPP
#define KOKKOS_ALLOCATION_TRACKER_HPP
#include <Kokkos_Macros.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_Traits.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h>
#include <cstdlib>
#include <string>
#include <iosfwd>
namespace Kokkos { namespace Impl {
//-----------------------------------------------------------------------------
// Create Singleton objects
//-----------------------------------------------------------------------------
typedef void * (*singleton_create_function_type)(void * buffer);
typedef void (*singleton_destroy_function_type)(void *);
void * create_singleton( size_t size
, singleton_create_function_type create_func
, singleton_destroy_function_type destroy_func
);
/// class Singleton
///
/// Default construct a singleton type. This method is used to circumvent
/// order of construction issues. Singleton objects are destroyed after all
/// other allocations in the reverse order of their creation.
template <typename Type>
class Singleton
{
public:
/// Get a pointer to the Singleton. Default construct the singleton if it does not already exist
static Type * get()
{
static Type * singleton = NULL;
if (singleton == NULL) {
Impl::singleton_create_function_type create_func = &create;
Impl::singleton_destroy_function_type destroy_func = &destroy;
singleton = reinterpret_cast<Type*>( Impl::create_singleton( sizeof(Type), create_func, destroy_func ) );
}
return singleton;
}
private:
/// Call the Type destructor
static void destroy(void * ptr)
{
reinterpret_cast<Type*>(ptr)->~Type();
}
/// placement new the Type in buffer
static void * create(void * buffer)
{
return new (buffer) Type();
}
};
//-----------------------------------------------------------------------------
// AllocatorBase
//-----------------------------------------------------------------------------
/// class AllocatorBase
///
/// Abstract base class for all Allocators.
/// Allocators should be singleton objects; use Singleton<Allocator>::get() to create
/// them and avoid order-of-destruction issues
class AllocatorBase
{
public:
/// name of the allocator
/// used to report memory leaks
virtual const char * name() const = 0;
/// Allocate a buffer of size number of bytes
virtual void* allocate(size_t size) const = 0;
/// Deallocate a buffer with size number of bytes
/// The pointer must have been allocated with a call to corresponding allocate
virtual void deallocate(void * ptr, size_t size) const = 0;
/// Changes the size of the memory block pointed to by ptr.
/// Ptr must have been allocated with the corresponding allocate call
/// The function may move the memory block to a new location
/// (whose address is returned by the function).
///
/// The content of the memory block is preserved up to the lesser of the new and
/// old sizes, even if the block is moved to a new location. If the new size is larger,
/// the value of the newly allocated portion is indeterminate.
///
/// If ptr is a null pointer, the function behaves like allocate, assigning a
/// new block of size bytes and returning a pointer to its beginning.
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const = 0;
/// can a texture object be bound to the allocated memory
virtual bool support_texture_binding() const = 0;
/// virtual destructor
virtual ~AllocatorBase() {}
};
/// class AllocatorAttributeBase
class AllocatorAttributeBase
{
public:
virtual ~AllocatorAttributeBase() {}
};
//-----------------------------------------------------------------------------
// Allocator< StaticAllocator > : public AllocatorBase
//-----------------------------------------------------------------------------
// HasStaticName
template<typename T>
class HasStaticName
{
typedef const char * (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::name>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticName<T>::value, const char *>::type
allocator_name()
{
return T::name();
}
template <typename T>
inline
typename enable_if<!HasStaticName<T>::value, const char *>::type
allocator_name()
{
return "Unnamed Allocator";
}
// HasStaticAllocate
template<typename T>
class HasStaticAllocate
{
typedef void * (*static_method)(size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::allocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t size)
{
return T::allocate(size);
}
template <typename T>
inline
typename enable_if<!HasStaticAllocate<T>::value, void *>::type
allocator_allocate(size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot allocate memory!") );
return NULL;
}
// HasStaticDeallocate
template<typename T>
class HasStaticDeallocate
{
typedef void (*static_method)(void *, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::deallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void * ptr, size_t size)
{
T::deallocate(ptr,size);
}
template <typename T>
inline
typename enable_if<!HasStaticDeallocate<T>::value, void>::type
allocator_deallocate(void *, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot deallocate memory!") );
}
// HasStaticReallocate
template<typename T>
class HasStaticReallocate
{
typedef void * (*static_method)(void *, size_t, size_t);
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::reallocate>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
return T::reallocate(old_ptr, old_size, new_size);
}
template <typename T>
inline
typename enable_if<!HasStaticReallocate<T>::value, void *>::type
allocator_reallocate(void *, size_t, size_t)
{
throw_runtime_exception( std::string("Error: ")
+ std::string(allocator_name<T>())
+ std::string(" cannot reallocate memory!") );
return NULL;
}
// HasStaticSupportTextureBinding
template<typename T>
class HasStaticSupportTextureBinding
{
typedef bool (*static_method)();
template<typename U, static_method> struct SFINAE {};
template<typename U> static char Test(SFINAE<U, &U::support_texture_binding>*);
template<typename U> static int Test(...);
public:
enum { value = sizeof(Test<T>(0)) == sizeof(char) };
};
template <typename T>
inline
typename enable_if<HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return T::support_texture_binding();
}
template <typename T>
inline
typename enable_if<!HasStaticSupportTextureBinding<T>::value, bool>::type
allocator_support_texture_binding()
{
return false;
}
template <typename T>
class Allocator : public AllocatorBase
{
public:
virtual const char * name() const
{
return allocator_name<T>();
}
virtual void* allocate(size_t size) const
{
return allocator_allocate<T>(size);
}
virtual void deallocate(void * ptr, size_t size) const
{
allocator_deallocate<T>(ptr,size);
}
virtual void * reallocate(void * old_ptr, size_t old_size, size_t new_size) const
{
return allocator_reallocate<T>(old_ptr, old_size, new_size);
}
virtual bool support_texture_binding() const
{
return allocator_support_texture_binding<T>();
}
static AllocatorBase * singleton()
{
return Singleton< Allocator<T> >::get();
}
};
//-----------------------------------------------------------------------------
// AllocationTracker
//-----------------------------------------------------------------------------
// forward declaration for friend classes
struct MallocHelper;
/// class AllocationTracker
/// Will call deallocate from the AllocatorBase when the reference count reaches 0.
/// Reference counting is disabled when the host is in parallel.
class AllocationTracker
{
// use the least significant bit of the AllocationRecord pointer to indicate if the
// AllocationTracker should reference count
enum {
REF_COUNT_BIT = static_cast<uintptr_t>(1)
, REF_COUNT_MASK = ~static_cast<uintptr_t>(1)
};
public:
/// Find an AllocationTracker such that
/// alloc_ptr <= ptr < alloc_ptr + alloc_size
/// O(n) where n is the number of tracked allocations.
template <typename StaticAllocator>
static AllocationTracker find( void const * ptr )
{
return find( ptr, Allocator<StaticAllocator>::singleton() );
}
/// Pretty print all the currently tracked memory
static void print_tracked_memory( std::ostream & out );
/// Default constructor
KOKKOS_INLINE_FUNCTION
AllocationTracker()
: m_alloc_rec(0)
{}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, void * arg_alloc_ptr
, size_t arg_alloc_size
, const std::string & arg_label = std::string("") )
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Create a AllocationTracker
///
/// Start reference counting the alloc_ptr.
/// When the reference count reaches 0 the allocator's deallocate method
/// will be called with the given size. The alloc_ptr should have been
/// allocated with the allocator's allocate method.
///
/// If arg_allocator == NULL OR arg_alloc_ptr == NULL OR size == 0
/// do nothing
template <typename StaticAllocator>
AllocationTracker( StaticAllocator const &
, size_t arg_alloc_size
, const std::string & arg_label = std::string("")
)
: m_alloc_rec(0)
{
AllocatorBase * arg_allocator = Allocator<StaticAllocator>::singleton();
void * arg_alloc_ptr = arg_allocator->allocate( arg_alloc_size );
initalize( arg_allocator, arg_alloc_ptr, arg_alloc_size, arg_label);
}
/// Copy an AllocationTracker
KOKKOS_INLINE_FUNCTION
AllocationTracker( const AllocationTracker & rhs )
: m_alloc_rec( rhs.m_alloc_rec)
{
#if !defined( __CUDA_ARCH__ )
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
#endif
}
/// Copy an AllocationTracker
/// Decrement the reference count of the current tracker if necessary
KOKKOS_INLINE_FUNCTION
AllocationTracker & operator=( const AllocationTracker & rhs )
{
if (this != &rhs) {
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
m_alloc_rec = rhs.m_alloc_rec;
if ( rhs.ref_counting() && tracking_enabled() ) {
increment_ref_count();
}
else {
m_alloc_rec = m_alloc_rec & REF_COUNT_MASK;
}
#else
m_alloc_rec = rhs.m_alloc_rec & REF_COUNT_MASK;
#endif
}
return * this;
}
/// Destructor
/// Decrement the reference count if necessary
KOKKOS_INLINE_FUNCTION
~AllocationTracker()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
}
/// Is the tracker valid?
KOKKOS_INLINE_FUNCTION
bool is_valid() const
{
return (m_alloc_rec & REF_COUNT_MASK);
}
/// clear the tracker
KOKKOS_INLINE_FUNCTION
void clear()
{
#if !defined( __CUDA_ARCH__ )
if ( ref_counting() ) {
decrement_ref_count();
}
#endif
m_alloc_rec = 0;
}
/// is this tracker currently counting allocations?
KOKKOS_INLINE_FUNCTION
bool ref_counting() const
{
return (m_alloc_rec & REF_COUNT_BIT);
}
AllocatorBase * allocator() const;
/// pointer to the allocated memory
void * alloc_ptr() const;
/// size in bytes of the allocated memory
size_t alloc_size() const;
/// the current reference count
size_t ref_count() const;
/// the label given to the allocation
char const * label() const;
/// pretty print all the tracker's information to the std::ostream
void print( std::ostream & oss) const;
/// set an attribute ptr on the allocation record
/// the arg_attribute pointer will be deleted when the record is destroyed
/// the attribute ptr can only be set once
bool set_attribute( AllocatorAttributeBase * arg_attribute) const;
/// get the attribute ptr from the allocation record
AllocatorAttributeBase * attribute() const;
/// reallocate the memory tracked by this allocation
/// NOT thread-safe
void reallocate( size_t size ) const;
static void disable_tracking();
static void enable_tracking();
static bool tracking_enabled();
private:
static AllocationTracker find( void const * ptr, AllocatorBase const * arg_allocator );
void initalize( AllocatorBase * arg_allocator
, void * arg_alloc_ptr
, size_t arg_alloc_size
, std::string const & label );
void increment_ref_count() const;
void decrement_ref_count() const;
friend struct Impl::MallocHelper;
uintptr_t m_alloc_rec;
};
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
#endif //KOKKOS_ALLOCATION_TRACKER_HPP

View File

@@ -0,0 +1,197 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#ifndef KOKKOS_IMPL_ANALYZE_POLICY_HPP
#define KOKKOS_IMPL_ANALYZE_POLICY_HPP
#include <Kokkos_Core_fwd.hpp>
#include <Kokkos_Concepts.hpp>
#include <impl/Kokkos_Tags.hpp>
namespace Kokkos { namespace Impl {
template < typename ExecutionSpace = void
, typename Schedule = void
, typename WorkTag = void
, typename IndexType = void
, typename IterationPattern = void
>
struct PolicyTraitsBase
{
using type = PolicyTraitsBase< ExecutionSpace, Schedule, WorkTag, IndexType, IterationPattern>;
using execution_space = ExecutionSpace;
using schedule_type = Schedule;
using work_tag = WorkTag;
using index_type = IndexType;
using iteration_pattern = IterationPattern;
};
template <typename PolicyBase, typename ExecutionSpace>
struct SetExecutionSpace
{
static_assert( is_void<typename PolicyBase::execution_space>::value
, "Kokkos Error: More than one execution space given" );
using type = PolicyTraitsBase< ExecutionSpace
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename Schedule>
struct SetSchedule
{
static_assert( is_void<typename PolicyBase::schedule_type>::value
, "Kokkos Error: More than one schedule type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, Schedule
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename WorkTag>
struct SetWorkTag
{
static_assert( is_void<typename PolicyBase::work_tag>::value
, "Kokkos Error: More than one work tag given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, WorkTag
, typename PolicyBase::index_type
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IndexType>
struct SetIndexType
{
static_assert( is_void<typename PolicyBase::index_type>::value
, "Kokkos Error: More than one index type given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, IndexType
, typename PolicyBase::iteration_pattern
>;
};
template <typename PolicyBase, typename IterationPattern>
struct SetIterationPattern
{
static_assert( is_void<typename PolicyBase::iteration_pattern>::value
, "Kokkos Error: More than one iteration_pattern given" );
using type = PolicyTraitsBase< typename PolicyBase::execution_space
, typename PolicyBase::schedule_type
, typename PolicyBase::work_tag
, typename PolicyBase::index_type
, IterationPattern
>;
};
template <typename Base, typename... Traits>
struct AnalyzePolicy;
template <typename Base, typename T, typename... Traits>
struct AnalyzePolicy<Base, T, Traits...> : public
AnalyzePolicy<
typename std::conditional< is_execution_space<T>::value , SetExecutionSpace<Base,T>
, typename std::conditional< is_schedule_type<T>::value , SetSchedule<Base,T>
, typename std::conditional< is_index_type<T>::value , SetIndexType<Base,T>
, typename std::conditional< std::is_integral<T>::value , SetIndexType<Base, IndexType<T> >
, typename std::conditional< is_iteration_pattern<T>::value, SetIterationPattern<Base,T>
, SetWorkTag<Base,T>
>::type >::type >::type >::type>::type::type
, Traits...
>
{};
template <typename Base>
struct AnalyzePolicy<Base>
{
using execution_space = typename std::conditional< is_void< typename Base::execution_space >::value
, DefaultExecutionSpace
, typename Base::execution_space
>::type;
using schedule_type = typename std::conditional< is_void< typename Base::schedule_type >::value
, Schedule< Static >
, typename Base::schedule_type
>::type;
using work_tag = typename Base::work_tag;
using index_type = typename std::conditional< is_void< typename Base::index_type >::value
, IndexType< typename execution_space::size_type >
, typename Base::index_type
>::type
::type // nasty hack to make index_type into an integral_type
; // instead of the wrapped IndexType<T> for backwards compatibility
using iteration_pattern = typename std::conditional< is_void< typename Base::iteration_pattern >::value
, void // TODO set default iteration pattern
, typename Base::iteration_pattern
>::type;
using type = PolicyTraitsBase< execution_space
, schedule_type
, work_tag
, index_type
, iteration_pattern
>;
};
template <typename... Traits>
struct PolicyTraits
: public AnalyzePolicy< PolicyTraitsBase<>, Traits... >::type
{};
}} // namespace Kokkos::Impl
#endif //KOKKOS_IMPL_ANALYZE_POLICY_HPP
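What the recursion buys is order-independence: each trait is routed to its slot, duplicates are rejected by the static_asserts, and unfilled slots get defaults (DefaultExecutionSpace, Schedule<Static>, the execution space's size_type). A hedged sketch, assuming a build with Kokkos::Serial enabled:

#include <type_traits>
#include <Kokkos_Core.hpp>

// Traits in arbitrary order; AnalyzePolicy sorts them into slots.
using P = Kokkos::Impl::PolicyTraits< Kokkos::Schedule<Kokkos::Static>
                                    , Kokkos::Serial
                                    , Kokkos::IndexType<int> >;

static_assert( std::is_same< P::execution_space, Kokkos::Serial >::value
             , "execution space routed by is_execution_space" );
// The terminal AnalyzePolicy unwraps IndexType<int> to plain int.
static_assert( std::is_same< P::index_type, int >::value
             , "index type unwrapped to its integral type" );

int main() { return 0; }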

View File

@@ -218,7 +218,17 @@ T atomic_compare_exchange( volatile T * const dest , const T compare ,
  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
  if( return_val == compare ) {
-   const T tmp = *dest = val;
+   // Don't use the following line of code here:
+   //
+   //const T tmp = *dest = val;
+   //
+   // Instead, put each assignment in its own statement.  This is
+   // because the overload of T::operator= for volatile *this should
+   // return void, not volatile T&.  See Kokkos #177:
+   //
+   // https://github.com/kokkos/kokkos/issues/177
+   *dest = val;
+   const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
    (void) tmp;
#endif
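The comment's rationale is quickest to see with a toy value type. A self-contained sketch (the MyScalar type is hypothetical, not a Kokkos type) of why the chained form cannot compile once operator= on a volatile *this returns void:

#include <cstdio>

struct MyScalar {
  int v;
  // Per Kokkos #177: assignment to a volatile lvalue returns void,
  // not volatile MyScalar&.
  void operator=(const MyScalar& rhs) volatile { v = rhs.v; }
};

int main() {
  volatile MyScalar dest = {0};
  MyScalar val = {42};
  // const MyScalar tmp = (dest = val);  // ill-formed: '(dest = val)' is void
  dest = val;                  // first statement: the volatile store
  MyScalar tmp = { dest.v };   // second statement: the read back
  std::printf("%d\n", tmp.v);  // prints 42
  return 0;
}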

View File

@@ -228,7 +228,17 @@ T atomic_exchange( volatile T * const dest ,
{
  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
- const T tmp = *dest = val;
+ // Don't use the following line of code here:
+ //
+ //const T tmp = *dest = val;
+ //
+ // Instead, put each assignment in its own statement.  This is
+ // because the overload of T::operator= for volatile *this should
+ // return void, not volatile T&.  See Kokkos #177:
+ //
+ // https://github.com/kokkos/kokkos/issues/177
+ *dest = val;
+ const T tmp = *dest;
#ifndef KOKKOS_COMPILER_CLANG
  (void) tmp;
#endif
@@ -305,7 +315,9 @@ void atomic_assign( volatile T * const dest ,
  // member.  The volatile return value implicitly defines a
  // dereference that some compilers (gcc 4.7.2) warn is being ignored.
  // Suppress warning by casting return to void.
- (void)( *dest = val );
+ //(void)( *dest = val );
+ *dest = val;
  Impl::unlock_address_host_space( (void*) dest );
}
//----------------------------------------------------------------------------

View File

@@ -93,7 +93,7 @@ T atomic_fetch_add( volatile T * const dest ,
    assume.i = oldval.i ;
    newval.t = assume.t + val ;
    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
- } while ( assumed.i != oldval.i );
+ } while ( assume.i != oldval.i );
  return oldval.t ;
}
@@ -156,9 +156,26 @@
#elif defined(KOKKOS_ATOMICS_USE_GCC) || defined(KOKKOS_ATOMICS_USE_INTEL)
+#if defined( KOKKOS_ENABLE_ASM ) && defined ( KOKKOS_USE_ISA_X86_64 )
+KOKKOS_INLINE_FUNCTION
+int atomic_fetch_add( volatile int * dest , const int val )
+{
+  int original = val;
+  __asm__ __volatile__(
+    "lock xadd %1, %0"
+    : "+m" (*dest), "+r" (original)
+    : "m" (*dest), "r" (original)
+    : "memory"
+  );
+  return original;
+}
+
+#else
KOKKOS_INLINE_FUNCTION
int atomic_fetch_add( volatile int * const dest , const int val )
-{ return __sync_fetch_and_add(dest,val); }
+{ return __sync_fetch_and_add(dest, val); }
+#endif
KOKKOS_INLINE_FUNCTION
long int atomic_fetch_add( volatile long int * const dest , const long int val )
@@ -276,7 +293,17 @@ T atomic_fetch_add( volatile T * const dest ,
{
  while( !Impl::lock_address_host_space( (void*) dest ) );
  T return_val = *dest;
- const T tmp = *dest = return_val + val;
+ // Don't use the following line of code here:
+ //
+ //const T tmp = *dest = return_val + val;
+ //
+ // Instead, put each assignment in its own statement.  This is
+ // because the overload of T::operator= for volatile *this should
+ // return void, not volatile T&.  See Kokkos #177:
+ //
+ // https://github.com/kokkos/kokkos/issues/177
+ *dest = return_val + val;
+ const T tmp = *dest;
  (void) tmp;
  Impl::unlock_address_host_space( (void*) dest );
  return return_val;
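Outside the diff, the semantics of the new lock xadd branch can be checked in isolation: the instruction atomically adds the register into memory and leaves the prior memory value in the register, matching __sync_fetch_and_add. A sketch that only builds with GCC or Clang on x86-64 (the asm operands mirror the hunk above):

#include <cstdio>

int fetch_add_xadd( volatile int * dest , const int val )
{
  int original = val;
  __asm__ __volatile__(
    "lock xadd %1, %0"
    : "+m" (*dest), "+r" (original)
    : "m" (*dest), "r" (original)
    : "memory"
  );
  return original;  // the value *dest held before the add
}

int main() {
  volatile int counter = 10;
  int prev = fetch_add_xadd(&counter, 5);
  // Same semantics as the builtin fallback:
  //   int prev = __sync_fetch_and_add(&counter, 5);
  std::printf("prev=%d now=%d\n", prev, counter);  // prev=10 now=15
  return 0;
}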

View File

@@ -73,7 +73,7 @@ T atomic_fetch_sub( volatile T * const dest ,
    assume.i = oldval.i ;
    newval.t = assume.t - val ;
    oldval.i = atomicCAS( (int*)dest , assume.i , newval.i );
- } while ( assumed.i != oldval.i );
+ } while ( assume.i != oldval.i );
  return oldval.t ;
}

View File

@@ -48,6 +48,22 @@
namespace Kokkos {
namespace Impl {
+template<class Scalar1, class Scalar2>
+struct MaxOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 > val2 ? val1 : val2);
+  }
+};
+
+template<class Scalar1, class Scalar2>
+struct MinOper {
+  KOKKOS_FORCEINLINE_FUNCTION
+  static Scalar1 apply(const Scalar1& val1, const Scalar2& val2) {
+    return (val1 < val2 ? val1 : val2);
+  }
+};
template<class Scalar1, class Scalar2>
struct AddOper {
  KOKKOS_FORCEINLINE_FUNCTION
@@ -276,6 +292,18 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest ,
namespace Kokkos {
// Fetch_Oper atomics: return value before operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_max(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_fetch_min(volatile T * const dest, const T val) {
+  return Impl::atomic_fetch_oper(Impl::MinOper<T,const T>(),dest,val);
+}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_fetch_mul(volatile T * const dest, const T val) {
@@ -326,6 +354,18 @@ T atomic_fetch_rshift(volatile T * const dest, const unsigned int val) {
// Oper Fetch atomics: return value after operation
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_max_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MaxOper<T,const T>(),dest,val);
+}
+
+template < typename T >
+KOKKOS_INLINE_FUNCTION
+T atomic_min_fetch(volatile T * const dest, const T val) {
+  return Impl::atomic_oper_fetch(Impl::MinOper<T,const T>(),dest,val);
+}
template < typename T >
KOKKOS_INLINE_FUNCTION
T atomic_mul_fetch(volatile T * const dest, const T val) {
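A hedged usage sketch (not from the diff) of the new atomic_fetch_max: a running maximum updated from concurrent threads. std::thread is used so the example is self-contained; it assumes a host build where Kokkos' header-only atomics are usable without Kokkos::initialize. Inside Kokkos one would call it from a parallel_for instead.

#include <cstdio>
#include <thread>
#include <vector>
#include <Kokkos_Core.hpp>

int main() {
  volatile int global_max = 0;
  std::vector<std::thread> workers;
  for (int t = 1; t <= 4; ++t) {
    workers.emplace_back([&global_max, t] {
      // Returns the value *before* the update; the store happens only
      // when t*10 exceeds the current maximum.
      (void) Kokkos::atomic_fetch_max(&global_max, t * 10);
    });
  }
  for (auto& w : workers) w.join();
  std::printf("max = %d\n", global_max);  // max = 40
  return 0;
}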

View File

@@ -425,42 +425,6 @@ struct Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<8> {
  typedef int64_t type;
};
#if ! KOKKOS_USING_EXP_VIEW
class AllocationTracker;
// Must be non-const, atomic access trait, and 32 or 64 bit type for true atomics.
template<class ViewTraits>
class ViewDataHandle<
ViewTraits ,
typename enable_if<
( ! is_same<typename ViewTraits::const_value_type,typename ViewTraits::value_type>::value) &&
( ViewTraits::memory_traits::Atomic )
>::type >
{
private:
// typedef typename if_c<(sizeof(typename ViewTraits::const_value_type)==4) ||
// (sizeof(typename ViewTraits::const_value_type)==8),
// int, Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars >::type
// atomic_view_possible;
typedef typename Kokkos_Atomic_is_only_allowed_with_32bit_and_64bit_scalars<sizeof(typename ViewTraits::const_value_type)>::type enable_atomic_type;
typedef ViewDataHandle self_type;
public:
enum { ReturnTypeIsReference = false };
typedef Impl::AtomicViewDataHandle<ViewTraits> handle_type;
typedef Impl::AtomicDataElement<ViewTraits> return_type;
KOKKOS_INLINE_FUNCTION
static handle_type create_handle( typename ViewTraits::value_type * arg_data_ptr, AllocationTracker const & /*arg_tracker*/ )
{
return handle_type(arg_data_ptr);
}
};
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
}} // namespace Kokkos::Impl
#endif

View File

@@ -1,287 +0,0 @@
/*
//@HEADER
// ************************************************************************
//
// Kokkos v. 2.0
// Copyright (2014) Sandia Corporation
//
// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation,
// the U.S. Government retains certain rights in this software.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
//
// 3. Neither the name of the Corporation nor the names of the
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY
// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE
// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov)
//
// ************************************************************************
//@HEADER
*/
#include <Kokkos_HostSpace.hpp>
#if ! KOKKOS_USING_EXP_VIEW
#include <impl/Kokkos_BasicAllocators.hpp>
#include <impl/Kokkos_Error.hpp>
#include <stdint.h> // uintptr_t
#include <cstdlib> // for malloc, realloc, and free
#include <cstring> // for memcpy
#if defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
#include <sys/mman.h> // for mmap, munmap, MAP_ANON, etc
#include <unistd.h> // for sysconf, _SC_PAGE_SIZE, _SC_PHYS_PAGES
#endif
#include <sstream>
namespace Kokkos { namespace Impl {
/*--------------------------------------------------------------------------*/
void* MallocAllocator::allocate( size_t size )
{
void * ptr = NULL;
if (size) {
ptr = malloc(size);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void MallocAllocator::deallocate( void * ptr, size_t /*size*/ )
{
if (ptr) {
free(ptr);
}
}
void * MallocAllocator::reallocate(void * old_ptr, size_t /*old_size*/, size_t new_size)
{
void * ptr = realloc(old_ptr, new_size);
if (new_size > 0u && ptr == NULL) {
throw_runtime_exception("Error: Malloc Allocator could not reallocate memory");
}
return ptr;
}
/*--------------------------------------------------------------------------*/
namespace {
void * raw_aligned_allocate( size_t size, size_t alignment )
{
void * ptr = NULL;
if ( size ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
ptr = _mm_malloc( size , alignment );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
posix_memalign( & ptr, alignment , size );
#else
// Over-allocate and round up to guarantee proper alignment.
size_t size_padded = size + alignment + sizeof(void *);
void * alloc_ptr = malloc( size_padded );
if (alloc_ptr) {
uintptr_t address = reinterpret_cast<uintptr_t>(alloc_ptr);
// offset enough to record the alloc_ptr
address += sizeof(void *);
uintptr_t rem = address % alignment;
uintptr_t offset = rem ? (alignment - rem) : 0u;
address += offset;
ptr = reinterpret_cast<void *>(address);
// record the alloc'd pointer
address -= sizeof(void *);
*reinterpret_cast<void **>(address) = alloc_ptr;
}
#endif
}
return ptr;
}
void raw_aligned_deallocate( void * ptr, size_t /*size*/ )
{
if ( ptr ) {
#if defined( __INTEL_COMPILER ) && !defined ( KOKKOS_HAVE_CUDA )
_mm_free( ptr );
#elif defined(KOKKOS_POSIX_MEMALIGN_AVAILABLE)
free( ptr );
#else
// get the alloc'd pointer
void * alloc_ptr = *(reinterpret_cast<void **>(ptr) -1);
free( alloc_ptr );
#endif
}
}
} // anonymous namespace
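The portable #else branch of raw_aligned_allocate above achieves alignment by over-allocating and stashing the pointer that malloc actually returned in the word just below the aligned address, so raw_aligned_deallocate can recover it. A self-contained sketch of that trick (the function names here are illustrative, not the Kokkos API):

#include <cassert>
#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Return 'size' bytes aligned to 'alignment'; the word just below the
// returned address records the pointer that malloc actually returned.
void* aligned_alloc_stash(std::size_t size, std::size_t alignment) {
  // worst-case slack: one stashed pointer plus up to (alignment - 1)
  // bytes of rounding, as in the code above
  void* raw = std::malloc(size + alignment + sizeof(void*));
  if (!raw) return nullptr;
  std::uintptr_t addr = reinterpret_cast<std::uintptr_t>(raw) + sizeof(void*);
  std::uintptr_t rem = addr % alignment;
  if (rem) addr += alignment - rem;
  void* aligned = reinterpret_cast<void*>(addr);
  reinterpret_cast<void**>(aligned)[-1] = raw;  // stash the original pointer
  return aligned;
}

void aligned_free_stash(void* aligned) {
  if (aligned) std::free(reinterpret_cast<void**>(aligned)[-1]);  // recover and free it
}

int main() {
  void* p = aligned_alloc_stash(100, 64);
  assert(reinterpret_cast<std::uintptr_t>(p) % 64 == 0);
  std::printf("aligned allocation at %p\n", p);
  aligned_free_stash(p);
}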
void* AlignedAllocator::allocate( size_t size )
{
void * ptr = NULL;
if ( size ) {
ptr = raw_aligned_allocate(size, MEMORY_ALIGNMENT);
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void AlignedAllocator::deallocate( void * ptr, size_t size )
{
raw_aligned_deallocate( ptr, size);
}
void * AlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = old_ptr;
if (old_size < new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, old_size );
deallocate( old_ptr, old_size );
}
return ptr;
}
/*--------------------------------------------------------------------------*/
// mmap flags for private anonymous memory allocation
#if defined( MAP_ANONYMOUS ) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANONYMOUS)
#elif defined( MAP_ANON) && defined( MAP_PRIVATE )
#define MMAP_FLAGS (MAP_PRIVATE | MAP_ANON)
#else
#define NO_MMAP
#endif
// huge page tables
#if !defined( NO_MMAP )
#if defined( MAP_HUGETLB )
#define MMAP_FLAGS_HUGE (MMAP_FLAGS | MAP_HUGETLB )
#elif defined( MMAP_FLAGS )
#define MMAP_FLAGS_HUGE MMAP_FLAGS
#endif
// threshold to use huge pages
#define MMAP_USE_HUGE_PAGES (1u << 27)
#endif
// read write access to private memory
#if !defined( NO_MMAP )
#define MMAP_PROTECTION (PROT_READ | PROT_WRITE)
#endif
void* PageAlignedAllocator::allocate( size_t size )
{
void *ptr = NULL;
if (size) {
#if !defined( NO_MMAP )
if ( size < MMAP_USE_HUGE_PAGES ) {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS, -1 /*file descriptor*/, 0 /*offset*/);
} else {
ptr = mmap( NULL, size, MMAP_PROTECTION, MMAP_FLAGS_HUGE, -1 /*file descriptor*/, 0 /*offset*/);
}
if (ptr == MAP_FAILED) {
ptr = NULL;
}
#else
static const size_t page_size = 4096; // TODO: read in from sysconf( _SC_PAGE_SIZE )
ptr = raw_aligned_allocate( size, page_size);
#endif
if (!ptr)
{
std::ostringstream msg ;
msg << name() << ": allocate(" << size << ") FAILED";
throw_runtime_exception( msg.str() );
}
}
return ptr;
}
void PageAlignedAllocator::deallocate( void * ptr, size_t size )
{
#if !defined( NO_MMAP )
munmap(ptr, size);
#else
raw_aligned_deallocate(ptr, size);
#endif
}
void * PageAlignedAllocator::reallocate(void * old_ptr, size_t old_size, size_t new_size)
{
void * ptr = NULL;
#if defined( NO_MMAP ) || defined( __APPLE__ ) || defined( __CYGWIN__ )
if (old_size != new_size) {
ptr = allocate( new_size );
memcpy(ptr, old_ptr, (old_size < new_size ? old_size : new_size) );
deallocate( old_ptr, old_size );
}
else {
ptr = old_ptr;
}
#else
ptr = mremap( old_ptr, old_size, new_size, MREMAP_MAYMOVE );
if (ptr == MAP_FAILED) {
throw_runtime_exception("Error: Page Aligned Allocator could not reallocate memory");
}
#endif
return ptr;
}
}} // namespace Kokkos::Impl
#endif /* #if ! KOKKOS_USING_EXP_VIEW */
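PageAlignedAllocator above maps private anonymous memory (adding MAP_HUGETLB for requests of 128 MiB and up, per MMAP_USE_HUGE_PAGES = 1u << 27) and, outside the NO_MMAP/Apple/Cygwin cases, resizes it with mremap. A stripped-down, Linux-only sketch of that mmap/mremap pattern, with error handling reduced to asserts:

#include <sys/mman.h>  // mmap, munmap, mremap (mremap is Linux-specific)
#include <cassert>
#include <cstddef>
#include <cstdio>
#include <cstring>

int main() {
  const std::size_t old_size = std::size_t(1) << 16;  // 64 KiB
  const std::size_t new_size = std::size_t(1) << 20;  // 1 MiB

  // Private anonymous mapping: page-aligned by construction.
  void* p = mmap(nullptr, old_size, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1 /*fd*/, 0 /*offset*/);
  assert(p != MAP_FAILED);
  std::memset(p, 0xAB, old_size);

  // Grow the mapping; MREMAP_MAYMOVE lets the kernel relocate it if it
  // cannot be extended in place.
  void* q = mremap(p, old_size, new_size, MREMAP_MAYMOVE);
  assert(q != MAP_FAILED);
  std::printf("remapped %zu -> %zu bytes at %p\n", old_size, new_size, q);

  munmap(q, new_size);
  return 0;
}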

Some files were not shown because too many files have changed in this diff.