diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index d414056187..e3de6b048d 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,75 @@ # Change Log +## [2.5.00](https://github.com/kokkos/kokkos/tree/2.5.00) (2017-12-15) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.11...2.5.00) + +**Part of the Kokkos C++ Performance Portability Programming EcoSystem 2.5** + +**Implemented enhancements:** + +- Provide Makefile.kokkos logic for CMake and TriBITS [\#878](https://github.com/kokkos/kokkos/issues/878) +- Add Scatter View [\#825](https://github.com/kokkos/kokkos/issues/825) +- Drop gcc 4.7 and intel 14 from supported compiler list [\#603](https://github.com/kokkos/kokkos/issues/603) +- Enable construction of unmanaged view using common\_view\_alloc\_prop [\#1170](https://github.com/kokkos/kokkos/issues/1170) +- Unused Function Warning with XL [\#1267](https://github.com/kokkos/kokkos/issues/1267) +- Add memory pool parameter check [\#1218](https://github.com/kokkos/kokkos/issues/1218) +- CUDA9: Fix warning for unsupported long double [\#1189](https://github.com/kokkos/kokkos/issues/1189) +- CUDA9: fix warning on defaulted function marking [\#1188](https://github.com/kokkos/kokkos/issues/1188) +- CUDA9: fix warnings for deprecated warp level functions [\#1187](https://github.com/kokkos/kokkos/issues/1187) +- Add CUDA 9.0 nightly testing [\#1174](https://github.com/kokkos/kokkos/issues/1174) +- {OMPI,MPICH}\_CXX hack breaks nvcc\_wrapper use case [\#1166](https://github.com/kokkos/kokkos/issues/1166) +- KOKKOS\_HAVE\_CUDA\_LAMBDA became KOKKOS\_CUDA\_USE\_LAMBDA [\#1274](https://github.com/kokkos/kokkos/issues/1274) + +**Fixed bugs:** + +- MinMax Reducer with tagged operator doesn't compile [\#1251](https://github.com/kokkos/kokkos/issues/1251) +- Reducers for Tagged operators give wrong answer [\#1250](https://github.com/kokkos/kokkos/issues/1250) +- Kokkos not Compatible with Big Endian Machines? 
[\#1235](https://github.com/kokkos/kokkos/issues/1235) +- Parallel Scan hangs forever on BG/Q [\#1234](https://github.com/kokkos/kokkos/issues/1234) +- Threads backend doesn't compile with Clang on OS X [\#1232](https://github.com/kokkos/kokkos/issues/1232) +- $\(shell date\) needs quote [\#1264](https://github.com/kokkos/kokkos/issues/1264) +- Unqualified parallel\_for call conflicts with user-defined parallel\_for [\#1219](https://github.com/kokkos/kokkos/issues/1219) +- KokkosAlgorithms: CMake issue in unit tests [\#1212](https://github.com/kokkos/kokkos/issues/1212) +- Intel 18 Error: "simd pragma has been deprecated" [\#1210](https://github.com/kokkos/kokkos/issues/1210) +- Memory leak in Kokkos::initialize [\#1194](https://github.com/kokkos/kokkos/issues/1194) +- CUDA9: compiler error with static assert template arguments [\#1190](https://github.com/kokkos/kokkos/issues/1190) +- Kokkos::Serial::is\_initialized returns always true [\#1184](https://github.com/kokkos/kokkos/issues/1184) +- Triple nested parallelism still fails on bowman [\#1093](https://github.com/kokkos/kokkos/issues/1093) +- OpenMP openmp.range on Develop Runs Forever on POWER7+ with RHEL7 and GCC4.8.5 [\#995](https://github.com/kokkos/kokkos/issues/995) +- Rendezvous performance at global scope [\#985](https://github.com/kokkos/kokkos/issues/985) + + +## [2.04.11](https://github.com/kokkos/kokkos/tree/2.04.11) (2017-10-28) +[Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.04...2.04.11) + +**Implemented enhancements:** + +- Add Subview pattern. 
[\#648](https://github.com/kokkos/kokkos/issues/648) +- Add Kokkos "global" is\_initialized [\#1060](https://github.com/kokkos/kokkos/issues/1060) +- Add create\_mirror\_view\_and\_copy [\#1161](https://github.com/kokkos/kokkos/issues/1161) +- Add KokkosConcepts SpaceAccessibility function [\#1092](https://github.com/kokkos/kokkos/issues/1092) +- Option to Disable Initialize Warnings [\#1142](https://github.com/kokkos/kokkos/issues/1142) +- Mature task-DAG capability [\#320](https://github.com/kokkos/kokkos/issues/320) +- Promote Work DAG from experimental [\#1126](https://github.com/kokkos/kokkos/issues/1126) +- Implement new WorkGraph push/pop [\#1108](https://github.com/kokkos/kokkos/issues/1108) +- Kokkos\_ENABLE\_Cuda\_Lambda should default ON [\#1101](https://github.com/kokkos/kokkos/issues/1101) +- Add multidimensional parallel for example and improve unit test [\#1064](https://github.com/kokkos/kokkos/issues/1064) +- Fix ROCm: Performance tests not building [\#1038](https://github.com/kokkos/kokkos/issues/1038) +- Make KOKKOS\_ALIGN\_SIZE a configure-time option [\#1004](https://github.com/kokkos/kokkos/issues/1004) +- Make alignment consistent [\#809](https://github.com/kokkos/kokkos/issues/809) +- Improve subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615) + +**Fixed bugs:** + +- Kokkos::vector fixes for application [\#1134](https://github.com/kokkos/kokkos/issues/1134) +- DynamicView non-power of two value\_type [\#1177](https://github.com/kokkos/kokkos/issues/1177) +- Memory pool bug [\#1154](https://github.com/kokkos/kokkos/issues/1154) +- Cuda launch bounds performance regression bug [\#1140](https://github.com/kokkos/kokkos/issues/1140) +- Significant performance regression in LAMMPS after updating Kokkos [\#1139](https://github.com/kokkos/kokkos/issues/1139) +- CUDA compile error [\#1128](https://github.com/kokkos/kokkos/issues/1128) +- MDRangePolicy neg idx test failure in debug mode 
[\#1113](https://github.com/kokkos/kokkos/issues/1113) +- subview construction on Cuda backend [\#615](https://github.com/kokkos/kokkos/issues/615) + ## [2.04.04](https://github.com/kokkos/kokkos/tree/2.04.04) (2017-09-11) [Full Changelog](https://github.com/kokkos/kokkos/compare/2.04.00...2.04.04) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index b2771ed527..9c708ded4a 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -1,3 +1,5 @@ +# Is this a build as part of Trilinos? + IF(COMMAND TRIBITS_PACKAGE_DECL) SET(KOKKOS_HAS_TRILINOS ON CACHE BOOL "") ELSE() @@ -6,13 +8,57 @@ ENDIF() IF(NOT KOKKOS_HAS_TRILINOS) cmake_minimum_required(VERSION 3.1 FATAL_ERROR) - project(Kokkos CXX) - INCLUDE(cmake/kokkos.cmake) + # Define Project Name if this is a standalone build + IF(NOT DEFINED ${PROJECT_NAME}) + project(Kokkos CXX) + ENDIF() + + # Basic initialization (Used in KOKKOS_SETTINGS) + set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) + set(KOKKOS_PATH ${KOKKOS_SRC_PATH}) + + #------------ COMPILER AND FEATURE CHECKS ------------------------------------ + include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) + set_kokkos_cxx_compiler() + set_kokkos_cxx_standard() + + #------------ GET OPTIONS AND KOKKOS_SETTINGS -------------------------------- + # Add Kokkos' modules to CMake's module path. 
+ set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") + + set(KOKKOS_CMAKE_VERBOSE True) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_options.cmake) + + include(${KOKKOS_SRC_PATH}/cmake/kokkos_settings.cmake) + + #------------ GENERATE HEADER AND SOURCE FILES ------------------------------- + execute_process( + COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRC_PATH}/cmake/Makefile.generate_cmake_settings CXX=${CMAKE_CXX_COMPILER} generate_build_settings + WORKING_DIRECTORY "${Kokkos_BINARY_DIR}" + OUTPUT_FILE ${Kokkos_BINARY_DIR}/core_src_make.out + RESULT_VARIABLE res + ) + include(${Kokkos_BINARY_DIR}/kokkos_generated_settings.cmake) + set_kokkos_srcs(KOKKOS_SRC ${KOKKOS_SRC}) + + #------------ NOW BUILD ------------------------------------------------------ + include(${KOKKOS_SRC_PATH}/cmake/kokkos_build.cmake) + + #------------ Add in Fake Tribits Handling to allow unit test builds- -------- + + include(${KOKKOS_SRC_PATH}/cmake/tribits.cmake) + + TRIBITS_PACKAGE_DECL(Kokkos) + + ADD_SUBDIRECTORY(core) + ADD_SUBDIRECTORY(containers) + ADD_SUBDIRECTORY(algorithms) + ELSE() #------------------------------------------------------------------------------ # -# A) Forward delcare the package so that certain options are also defined for +# A) Forward declare the package so that certain options are also defined for # subpackages # @@ -21,178 +67,28 @@ TRIBITS_PACKAGE_DECL(Kokkos) # ENABLE_SHADOWING_WARNINGS) #------------------------------------------------------------------------------ # -# B) Define the common options for Kokkos first so they can be used by -# subpackages as well. +# B) Install Kokkos' build files # +# If using the Makefile-generated files, then need to set things up. 
+# Here, assume that TriBITS has been run from ProjectCompilerPostConfig.cmake +# and already generated KokkosCore_config.h and kokkos_generated_settings.cmake +# in the previously defined Kokkos_GEN_DIR +# We need to copy them over to the correct place and source the cmake file -# mfh 01 Aug 2016: See Issue #61: -# -# https://github.com/kokkos/kokkos/issues/61 -# -# Don't use TRIBITS_ADD_DEBUG_OPTION() here, because that defines -# HAVE_KOKKOS_DEBUG. We define KOKKOS_HAVE_DEBUG here instead, -# for compatibility with Kokkos' Makefile build system. +if(NOT KOKKOS_LEGACY_TRIBITS) + set(Kokkos_GEN_DIR ${CMAKE_BINARY_DIR}) + file(COPY "${Kokkos_GEN_DIR}/KokkosCore_config.h" + DESTINATION "${CMAKE_CURRENT_BINARY_DIR}" USE_SOURCE_PERMISSIONS) + install(FILES "${Kokkos_GEN_DIR}/KokkosCore_config.h" + DESTINATION include) + file(COPY "${Kokkos_GEN_DIR}/kokkos_generated_settings.cmake" + DESTINATION "${CMAKE_CURRENT_BINARY_DIR}" USE_SOURCE_PERMISSIONS) -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_DEBUG - KOKKOS_HAVE_DEBUG - "Enable run-time debug checks. These checks may be expensive, so they are disabled by default in a release build." - ${${PROJECT_NAME}_ENABLE_DEBUG} -) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_SIERRA_BUILD - KOKKOS_FOR_SIERRA - "Configure Kokkos for building within the Sierra build system." - OFF - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Cuda - KOKKOS_HAVE_CUDA - "Enable CUDA support in Kokkos." - "${TPL_ENABLE_CUDA}" - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Cuda_UVM - KOKKOS_USE_CUDA_UVM - "Enable CUDA Unified Virtual Memory as the default in Kokkos." - OFF - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Cuda_RDC - KOKKOS_HAVE_CUDA_RDC - "Enable CUDA Relocatable Device Code support in Kokkos." - OFF - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Cuda_Lambda - KOKKOS_HAVE_CUDA_LAMBDA - "Enable CUDA LAMBDA support in Kokkos." 
- OFF - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Pthread - KOKKOS_HAVE_PTHREAD - "Enable Pthread support in Kokkos." - OFF - ) - -ASSERT_DEFINED(TPL_ENABLE_Pthread) -IF(Kokkos_ENABLE_Pthread AND NOT TPL_ENABLE_Pthread) - MESSAGE(FATAL_ERROR "You set Kokkos_ENABLE_Pthread=ON, but Trilinos' support for Pthread(s) is not enabled (TPL_ENABLE_Pthread=OFF). This is not allowed. Please enable Pthreads in Trilinos before attempting to enable Kokkos' support for Pthreads.") -ENDIF() -IF(NOT TPL_ENABLE_Pthread) - ADD_DEFINITIONS(-DGTEST_HAS_PTHREAD=0) -ENDIF() - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_OpenMP - KOKKOS_HAVE_OPENMP - "Enable OpenMP support in Kokkos." - "${${PROJECT_NAME}_ENABLE_OpenMP}" - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_QTHREAD - KOKKOS_HAVE_QTHREADS - "Enable Qthreads support in Kokkos." - "${TPL_ENABLE_QTHREAD}" - ) - -# TODO: No longer an option in Kokkos. Needs to be removed. -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_CXX11 - KOKKOS_HAVE_CXX11 - "Enable C++11 support in Kokkos." - "${${PROJECT_NAME}_ENABLE_CXX11}" - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_HWLOC - KOKKOS_HAVE_HWLOC - "Enable HWLOC support in Kokkos." - "${TPL_ENABLE_HWLOC}" - ) - -# TODO: This is currently not used in Kokkos. Should it be removed? -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_MPI - KOKKOS_HAVE_MPI - "Enable MPI support in Kokkos." - "${TPL_ENABLE_MPI}" - ) - -# Set default value of Kokkos_ENABLE_Debug_Bounds_Check option -# -# CMake is case sensitive. The Kokkos_ENABLE_Debug_Bounds_Check -# option (defined below) is annoyingly not all caps, but we need to -# keep it that way for backwards compatibility. If users forget and -# try using an all-caps variable, then make it count by using the -# all-caps version as the default value of the original, not-all-caps -# option. Otherwise, the default value of this option comes from -# Kokkos_ENABLE_DEBUG (see Issue #367). 
- -ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_DEBUG) -IF(DEFINED Kokkos_ENABLE_DEBUG_BOUNDS_CHECK) - IF(Kokkos_ENABLE_DEBUG_BOUNDS_CHECK) - SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT ON) - ELSE() - SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}") - ENDIF() -ELSE() - SET(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT "${${PACKAGE_NAME}_ENABLE_DEBUG}") -ENDIF() -ASSERT_DEFINED(Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Debug_Bounds_Check - KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - "Enable Kokkos::View run-time bounds checking." - "${Kokkos_ENABLE_Debug_Bounds_Check_DEFAULT}" - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Debug_DualView_Modify_Check - KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK - "Enable abort when Kokkos::DualView modified on host and device without sync." - "${Kokkos_ENABLE_DEBUG}" - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Profiling - KOKKOS_ENABLE_PROFILING - "Enable KokkosP profiling support for kernel data collections." - "${TPL_ENABLE_DLlib}" - ) - -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Profiling_Load_Print - KOKKOS_ENABLE_PROFILING_LOAD_PRINT - "Print to standard output which profiling library was loaded." - OFF - ) - -# placeholder for future device... -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Winthread - KOKKOS_HAVE_WINTHREAD - "Enable Winthread support in Kokkos." - "${TPL_ENABLE_Winthread}" - ) - -# TODO: No longer an option in Kokkos. Needs to be removed. 
-# use new/old View -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_USING_DEPRECATED_VIEW - KOKKOS_USING_DEPRECATED_VIEW - "Choose whether to use the old, deprecated Kokkos::View" - OFF - ) + include(${CMAKE_CURRENT_BINARY_DIR}/kokkos_generated_settings.cmake) + # Sources come from makefile-generated kokkos_generated_settings.cmake file + # Enable using the individual sources if needed + set_kokkos_srcs(KOKKOS_SRC ${KOKKOS_SRC}) +endif () #------------------------------------------------------------------------------ @@ -226,10 +122,6 @@ TRIBITS_PACKAGE_DEF() TRIBITS_EXCLUDE_AUTOTOOLS_FILES() -TRIBITS_EXCLUDE_FILES( - classic/doc - classic/LinAlg/doc/CrsRefactorNotesMay2012 - ) - TRIBITS_PACKAGE_POSTPROCESS() + ENDIF() diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 4641232a1f..4315b009d5 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -28,33 +28,39 @@ KOKKOS_OPTIONS ?= "" # Options: force_uvm,use_ldg,rdc,enable_lambda KOKKOS_CUDA_OPTIONS ?= "enable_lambda" +# Return a 1 if a string contains a substring and 0 if not +# Note the search string should be without '"' +# Example: with KOKKOS_USE_TPLS="hwloc,librt", $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) +# Will return a 1 +kokkos_has_string=$(if $(findstring $2,$1),1,0) + # Check for general settings. -KOKKOS_INTERNAL_ENABLE_DEBUG := $(strip $(shell echo $(KOKKOS_DEBUG) | grep "yes" | wc -l)) -KOKKOS_INTERNAL_ENABLE_CXX11 := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++11" | wc -l)) -KOKKOS_INTERNAL_ENABLE_CXX1Z := $(strip $(shell echo $(KOKKOS_CXX_STANDARD) | grep "c++1z" | wc -l)) +KOKKOS_INTERNAL_ENABLE_DEBUG := $(call kokkos_has_string,$(KOKKOS_DEBUG),yes) +KOKKOS_INTERNAL_ENABLE_CXX11 := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++11) +KOKKOS_INTERNAL_ENABLE_CXX1Z := $(call kokkos_has_string,$(KOKKOS_CXX_STANDARD),c++1z) # Check for external libraries. 
-KOKKOS_INTERNAL_USE_HWLOC := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "hwloc" | wc -l)) -KOKKOS_INTERNAL_USE_LIBRT := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "librt" | wc -l)) -KOKKOS_INTERNAL_USE_MEMKIND := $(strip $(shell echo $(KOKKOS_USE_TPLS) | grep "experimental_memkind" | wc -l)) +KOKKOS_INTERNAL_USE_HWLOC := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),hwloc) +KOKKOS_INTERNAL_USE_LIBRT := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),librt) +KOKKOS_INTERNAL_USE_MEMKIND := $(call kokkos_has_string,$(KOKKOS_USE_TPLS),experimental_memkind) # Check for advanced settings. -KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "compiler_warnings" | wc -l)) -KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "aggressive_vectorization" | wc -l)) -KOKKOS_INTERNAL_DISABLE_PROFILING := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_profiling" | wc -l)) -KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "disable_dualview_modify_check" | wc -l)) -KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(strip $(shell echo $(KOKKOS_OPTIONS) | grep "enable_profile_load_print" | wc -l)) -KOKKOS_INTERNAL_CUDA_USE_LDG := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "use_ldg" | wc -l)) -KOKKOS_INTERNAL_CUDA_USE_UVM := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "force_uvm" | wc -l)) -KOKKOS_INTERNAL_CUDA_USE_RELOC := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "rdc" | wc -l)) -KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(strip $(shell echo $(KOKKOS_CUDA_OPTIONS) | grep "enable_lambda" | wc -l)) +KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),compiler_warnings) +KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION := $(call kokkos_has_string,$(KOKKOS_OPTIONS),aggressive_vectorization) +KOKKOS_INTERNAL_DISABLE_PROFILING := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_profiling) 
+KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK := $(call kokkos_has_string,$(KOKKOS_OPTIONS),disable_dualview_modify_check) +KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_profile_load_print) +KOKKOS_INTERNAL_CUDA_USE_LDG := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),use_ldg) +KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),force_uvm) +KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) +KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) # Check for Kokkos Host Execution Spaces one of which must be on. -KOKKOS_INTERNAL_USE_OPENMP := $(strip $(shell echo $(subst OpenMPTarget,,$(KOKKOS_DEVICES)) | grep OpenMP | wc -l)) -KOKKOS_INTERNAL_USE_PTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Pthread | wc -l)) -KOKKOS_INTERNAL_USE_QTHREADS := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Qthreads | wc -l)) -KOKKOS_INTERNAL_USE_SERIAL := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Serial | wc -l)) +KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) +KOKKOS_INTERNAL_USE_PTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Pthread) +KOKKOS_INTERNAL_USE_QTHREADS := $(call kokkos_has_string,$(KOKKOS_DEVICES),Qthreads) +KOKKOS_INTERNAL_USE_SERIAL := $(call kokkos_has_string,$(KOKKOS_DEVICES),Serial) ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0) ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 0) @@ -65,9 +71,9 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 0) endif # Check for other Execution Spaces. 
-KOKKOS_INTERNAL_USE_CUDA := $(strip $(shell echo $(KOKKOS_DEVICES) | grep Cuda | wc -l)) -KOKKOS_INTERNAL_USE_ROCM := $(strip $(shell echo $(KOKKOS_DEVICES) | grep ROCm | wc -l)) -KOKKOS_INTERNAL_USE_OPENMPTARGET := $(strip $(shell echo $(KOKKOS_DEVICES) | grep OpenMPTarget | wc -l)) +KOKKOS_INTERNAL_USE_CUDA := $(call kokkos_has_string,$(KOKKOS_DEVICES),Cuda) +KOKKOS_INTERNAL_USE_ROCM := $(call kokkos_has_string,$(KOKKOS_DEVICES),ROCm) +KOKKOS_INTERNAL_USE_OPENMPTARGET := $(call kokkos_has_string,$(KOKKOS_DEVICES),OpenMPTarget) ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) @@ -77,25 +83,20 @@ endif # Check OS. KOKKOS_OS := $(strip $(shell uname -s)) -KOKKOS_INTERNAL_OS_CYGWIN := $(strip $(shell uname -s | grep CYGWIN | wc -l)) -KOKKOS_INTERNAL_OS_LINUX := $(strip $(shell uname -s | grep Linux | wc -l)) -KOKKOS_INTERNAL_OS_DARWIN := $(strip $(shell uname -s | grep Darwin | wc -l)) +KOKKOS_INTERNAL_OS_CYGWIN := $(call kokkos_has_string,$(KOKKOS_OS),CYGWIN) +KOKKOS_INTERNAL_OS_LINUX := $(call kokkos_has_string,$(KOKKOS_OS),Linux) +KOKKOS_INTERNAL_OS_DARWIN := $(call kokkos_has_string,$(KOKKOS_OS),Darwin) # Check compiler. 
-KOKKOS_INTERNAL_COMPILER_INTEL := $(strip $(shell $(CXX) --version 2>&1 | grep "Intel Corporation" | wc -l)) -KOKKOS_INTERNAL_COMPILER_PGI := $(strip $(shell $(CXX) --version 2>&1 | grep PGI | wc -l)) +KOKKOS_CXX_VERSION := $(strip $(shell $(CXX) --version 2>&1)) +KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation) +KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI) KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)) KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)) -KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(CXX) --version 2>&1 | grep nvcc | wc -l)) -ifneq ($(OMPI_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(OMPI_CXX) --version 2>&1 | grep nvcc | wc -l)) -endif -ifneq ($(MPICH_CXX),) - KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell $(MPICH_CXX) --version 2>&1 | grep nvcc | wc -l)) -endif -KOKKOS_INTERNAL_COMPILER_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep clang | wc -l)) -KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(strip $(shell $(CXX) --version 2>&1 | grep "apple-darwin" | wc -l)) -KOKKOS_INTERNAL_COMPILER_HCC := $(strip $(shell $(CXX) --version 2>&1 | grep HCC | wc -l)) +KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l)) +KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),apple-darwin) +KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 2) KOKKOS_INTERNAL_COMPILER_CLANG = 1 @@ -209,47 +210,48 @@ endif # Check for Kokkos Architecture settings. # Intel based. 
-KOKKOS_INTERNAL_USE_ARCH_KNC := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNC | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_WSM := $(strip $(shell echo $(KOKKOS_ARCH) | grep WSM | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_SNB := $(strip $(shell echo $(KOKKOS_ARCH) | grep SNB | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_HSW := $(strip $(shell echo $(KOKKOS_ARCH) | grep HSW | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_BDW := $(strip $(shell echo $(KOKKOS_ARCH) | grep BDW | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_SKX := $(strip $(shell echo $(KOKKOS_ARCH) | grep SKX | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_KNL := $(strip $(shell echo $(KOKKOS_ARCH) | grep KNL | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_KNC := $(call kokkos_has_string,$(KOKKOS_ARCH),KNC) +KOKKOS_INTERNAL_USE_ARCH_WSM := $(call kokkos_has_string,$(KOKKOS_ARCH),WSM) +KOKKOS_INTERNAL_USE_ARCH_SNB := $(call kokkos_has_string,$(KOKKOS_ARCH),SNB) +KOKKOS_INTERNAL_USE_ARCH_HSW := $(call kokkos_has_string,$(KOKKOS_ARCH),HSW) +KOKKOS_INTERNAL_USE_ARCH_BDW := $(call kokkos_has_string,$(KOKKOS_ARCH),BDW) +KOKKOS_INTERNAL_USE_ARCH_SKX := $(call kokkos_has_string,$(KOKKOS_ARCH),SKX) +KOKKOS_INTERNAL_USE_ARCH_KNL := $(call kokkos_has_string,$(KOKKOS_ARCH),KNL) # NVIDIA based. 
NVCC_WRAPPER := $(KOKKOS_PATH)/bin/nvcc_wrapper -KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler30 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler32 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler35 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler37 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell50 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell52 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell53 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal61 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Pascal60 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ - + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ - + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ - + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ - + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ - + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) +KOKKOS_INTERNAL_USE_ARCH_KEPLER30 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler30) +KOKKOS_INTERNAL_USE_ARCH_KEPLER32 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler32) +KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler35) +KOKKOS_INTERNAL_USE_ARCH_KEPLER37 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler37) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell50) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL52 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell52) +KOKKOS_INTERNAL_USE_ARCH_MAXWELL53 := $(call 
kokkos_has_string,$(KOKKOS_ARCH),Maxwell53) +KOKKOS_INTERNAL_USE_ARCH_PASCAL61 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal61) +KOKKOS_INTERNAL_USE_ARCH_PASCAL60 := $(call kokkos_has_string,$(KOKKOS_ARCH),Pascal60) +KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53)) +#SEK: This seems like a bug to me ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) - KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Maxwell | wc -l)) - KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kepler | wc -l)) - KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ - + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ - + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ - + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ - + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ - + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ - + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53) | bc)) + KOKKOS_INTERNAL_USE_ARCH_MAXWELL50 := $(call kokkos_has_string,$(KOKKOS_ARCH),Maxwell) + KOKKOS_INTERNAL_USE_ARCH_KEPLER35 := $(call kokkos_has_string,$(KOKKOS_ARCH),Kepler) + KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ + + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER37) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL61) \ + + $(KOKKOS_INTERNAL_USE_ARCH_PASCAL60) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52) \ + + $(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53)) endif 
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) @@ -262,43 +264,43 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) endif endif # ARM based. -KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv80 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv81 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(strip $(shell echo $(KOKKOS_ARCH) | grep ARMv8-ThunderX | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_ARMV80 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv80) +KOKKOS_INTERNAL_USE_ARCH_ARMV81 := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv81) +KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX := $(call kokkos_has_string,$(KOKKOS_ARCH),ARMv8-ThunderX) KOKKOS_INTERNAL_USE_ARCH_ARM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_ARMV80)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV81)+$(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX) | bc)) # IBM based. -KOKKOS_INTERNAL_USE_ARCH_BGQ := $(strip $(shell echo $(KOKKOS_ARCH) | grep BGQ | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power7 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power8 | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(strip $(shell echo $(KOKKOS_ARCH) | grep Power9 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_BGQ := $(call kokkos_has_string,$(KOKKOS_ARCH),BGQ) +KOKKOS_INTERNAL_USE_ARCH_POWER7 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power7) +KOKKOS_INTERNAL_USE_ARCH_POWER8 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power8) +KOKKOS_INTERNAL_USE_ARCH_POWER9 := $(call kokkos_has_string,$(KOKKOS_ARCH),Power9) KOKKOS_INTERNAL_USE_ARCH_IBM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BGQ)+$(KOKKOS_INTERNAL_USE_ARCH_POWER7)+$(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc)) # AMD based. 
-KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(strip $(shell echo $(KOKKOS_ARCH) | grep AMDAVX | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(strip $(shell echo $(KOKKOS_ARCH) | grep Ryzen | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_EPYC := $(strip $(shell echo $(KOKKOS_ARCH) | grep Epyc | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(strip $(shell echo $(KOKKOS_ARCH) | grep Kaveri | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(strip $(shell echo $(KOKKOS_ARCH) | grep Carrizo | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_FIJI := $(strip $(shell echo $(KOKKOS_ARCH) | grep Fiji | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_VEGA := $(strip $(shell echo $(KOKKOS_ARCH) | grep Vega | wc -l)) -KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(strip $(shell echo $(KOKKOS_ARCH) | grep gfx901 | wc -l)) +KOKKOS_INTERNAL_USE_ARCH_AMDAVX := $(call kokkos_has_string,$(KOKKOS_ARCH),AMDAVX) +KOKKOS_INTERNAL_USE_ARCH_RYZEN := $(call kokkos_has_string,$(KOKKOS_ARCH),Ryzen) +KOKKOS_INTERNAL_USE_ARCH_EPYC := $(call kokkos_has_string,$(KOKKOS_ARCH),Epyc) +KOKKOS_INTERNAL_USE_ARCH_KAVERI := $(call kokkos_has_string,$(KOKKOS_ARCH),Kaveri) +KOKKOS_INTERNAL_USE_ARCH_CARRIZO := $(call kokkos_has_string,$(KOKKOS_ARCH),Carrizo) +KOKKOS_INTERNAL_USE_ARCH_FIJI := $(call kokkos_has_string,$(KOKKOS_ARCH),Fiji) +KOKKOS_INTERNAL_USE_ARCH_VEGA := $(call kokkos_has_string,$(KOKKOS_ARCH),Vega) +KOKKOS_INTERNAL_USE_ARCH_GFX901 := $(call kokkos_has_string,$(KOKKOS_ARCH),gfx901) # Any AVX? 
-KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM) | bc )) -KOKKOS_INTERNAL_USE_ARCH_AVX := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_AMDAVX) | bc )) -KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW) | bc )) -KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNL) | bc )) -KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) +KOKKOS_INTERNAL_USE_ARCH_SSE42 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM)) +KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) +KOKKOS_INTERNAL_USE_ARCH_AVX2 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW)) +KOKKOS_INTERNAL_USE_ARCH_AVX512MIC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNL)) +KOKKOS_INTERNAL_USE_ARCH_AVX512XEON := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Decide what ISA level we are able to support. 
-KOKKOS_INTERNAL_USE_ISA_X86_64 := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_WSM)+$(KOKKOS_INTERNAL_USE_ARCH_SNB)+$(KOKKOS_INTERNAL_USE_ARCH_HSW)+$(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_KNL)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) -KOKKOS_INTERNAL_USE_ISA_KNC := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_KNC) | bc )) -KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER8)+$(KOKKOS_INTERNAL_USE_ARCH_POWER9) | bc )) -KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_POWER7) | bc )) +KOKKOS_INTERNAL_USE_ISA_X86_64 := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_WSM) + $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_HSW) + $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_KNL) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) +KOKKOS_INTERNAL_USE_ISA_KNC := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KNC)) +KOKKOS_INTERNAL_USE_ISA_POWERPCLE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER8) + $(KOKKOS_INTERNAL_USE_ARCH_POWER9)) +KOKKOS_INTERNAL_USE_ISA_POWERPCBE := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_POWER7)) # Decide whether we can support transactional memory -KOKKOS_INTERNAL_USE_TM := $(strip $(shell echo $(KOKKOS_INTERNAL_USE_ARCH_BDW)+$(KOKKOS_INTERNAL_USE_ARCH_SKX) | bc )) +KOKKOS_INTERNAL_USE_TM := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_BDW) + $(KOKKOS_INTERNAL_USE_ARCH_SKX)) # Incompatible flags? 
KOKKOS_INTERNAL_USE_ARCH_MULTIHOST := $(strip $(shell echo "$(KOKKOS_INTERNAL_USE_ARCH_SSE42)+$(KOKKOS_INTERNAL_USE_ARCH_AVX)+$(KOKKOS_INTERNAL_USE_ARCH_AVX2)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC)+$(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON)+$(KOKKOS_INTERNAL_USE_ARCH_KNC)+$(KOKKOS_INTERNAL_USE_ARCH_IBM)+$(KOKKOS_INTERNAL_USE_ARCH_ARM)>1" | bc )) @@ -320,94 +322,100 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_COMPILER_WARNINGS) endif -KOKKOS_LIBS = -lkokkos -ldl +KOKKOS_LIBS = -ldl KOKKOS_LDFLAGS = -L$(shell pwd) KOKKOS_SRC = KOKKOS_HEADERS = # Generating the KokkosCore_config.h file. +KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp +KOKKOS_CONFIG_HEADER=KokkosCore_config.h +# Functions for generating config header file +kokkos_append_header = $(shell echo $1 >> $(KOKKOS_INTERNAL_CONFIG_TMP)) + +# Do not append first line tmp := $(shell echo "/* ---------------------------------------------" > KokkosCore_config.tmp) -tmp := $(shell echo "Makefile constructed configuration:" >> KokkosCore_config.tmp) -tmp := $(shell date >> KokkosCore_config.tmp) -tmp := $(shell echo "----------------------------------------------*/" >> KokkosCore_config.tmp) +tmp := $(call kokkos_append_header,"Makefile constructed configuration:") +tmp := $(call kokkos_append_header,"$(shell date)") +tmp := $(call kokkos_append_header,"----------------------------------------------*/") -tmp := $(shell echo '\#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)' >> KokkosCore_config.tmp) -tmp := $(shell echo '\#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead."' >> KokkosCore_config.tmp) -tmp := $(shell echo '\#else' >> KokkosCore_config.tmp) -tmp := $(shell echo '\#define KOKKOS_CORE_CONFIG_H' >> KokkosCore_config.tmp) -tmp := $(shell echo '\#endif' >> KokkosCore_config.tmp) - -tmp := $(shell echo "/* Execution Spaces */" >> KokkosCore_config.tmp) +tmp := $(call kokkos_append_header,'\#if 
!defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H)') +tmp := $(call kokkos_append_header,'\#error "Do not include $(KOKKOS_CONFIG_HEADER) directly; include Kokkos_Macros.hpp instead."') +tmp := $(call kokkos_append_header,'\#else') +tmp := $(call kokkos_append_header,'\#define KOKKOS_CORE_CONFIG_H') +tmp := $(call kokkos_append_header,'\#endif') + +tmp := $(call kokkos_append_header,"/* Execution Spaces */") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - tmp := $(shell echo "\#define KOKKOS_HAVE_CUDA 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CUDA") endif ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) - tmp := $(shell echo '\#define KOKKOS_ENABLE_ROCM 1' >> KokkosCore_config.tmp) + tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_ROCM') endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) - tmp := $(shell echo '\#define KOKKOS_ENABLE_OPENMPTARGET 1' >> KokkosCore_config.tmp) + tmp := $(call kokkos_append_header,'\#define KOKKOS_ENABLE_OPENMPTARGET') endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(shell echo '\#define KOKKOS_HAVE_OPENMP 1' >> KokkosCore_config.tmp) + tmp := $(call kokkos_append_header,'\#define KOKKOS_HAVE_OPENMP') endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) - tmp := $(shell echo "\#define KOKKOS_HAVE_PTHREAD 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_PTHREAD") endif ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1) - tmp := $(shell echo "\#define KOKKOS_HAVE_QTHREADS 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_QTHREADS") endif ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(shell echo "\#define KOKKOS_HAVE_SERIAL 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_SERIAL") endif ifeq ($(KOKKOS_INTERNAL_USE_TM), 1) - tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ENABLE_TM" >> KokkosCore_config.tmp ) - 
tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TM") + tmp := $(call kokkos_append_header,"\#endif") endif ifeq ($(KOKKOS_INTERNAL_USE_ISA_X86_64), 1) - tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_USE_ISA_X86_64" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_X86_64") + tmp := $(call kokkos_append_header,"\#endif") endif ifeq ($(KOKKOS_INTERNAL_USE_ISA_KNC), 1) - tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_USE_ISA_KNC" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_KNC") + tmp := $(call kokkos_append_header,"\#endif") endif ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCLE), 1) - tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCLE" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCLE") + tmp := $(call kokkos_append_header,"\#endif") endif ifeq ($(KOKKOS_INTERNAL_USE_ISA_POWERPCBE), 1) - tmp := $(shell echo "\#ifndef __CUDA_ARCH__" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_USE_ISA_POWERPCBE" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#endif" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#ifndef __CUDA_ARCH__") + tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_ISA_POWERPCBE") + tmp := $(call 
kokkos_append_header,"\#endif") endif -tmp := $(shell echo "/* General Settings */" >> KokkosCore_config.tmp) +tmp := $(call kokkos_append_header,"/* General Settings */") ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX11), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX11_FLAG) - tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX11") endif ifeq ($(KOKKOS_INTERNAL_ENABLE_CXX1Z), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CXX1Z_FLAG) - tmp := $(shell echo "\#define KOKKOS_HAVE_CXX11 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_HAVE_CXX1Z 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX11") + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_CXX1Z") endif ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) @@ -417,26 +425,26 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_DEBUG), 1) KOKKOS_CXXFLAGS += -g KOKKOS_LDFLAGS += -g -ldl - tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_HAVE_DEBUG 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK") + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_DEBUG") ifeq ($(KOKKOS_INTERNAL_DISABLE_DUALVIEW_MODIFY_CHECK), 0) - tmp := $(shell echo "\#define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK") endif endif ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1) - tmp := $(shell echo "\#define KOKKOS_ENABLE_PROFILING_LOAD_PRINT 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING_LOAD_PRINT") endif ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) KOKKOS_CPPFLAGS += -I$(HWLOC_PATH)/include KOKKOS_LDFLAGS += -L$(HWLOC_PATH)/lib KOKKOS_LIBS += -lhwloc - tmp := $(shell echo "\#define KOKKOS_HAVE_HWLOC 1" >> 
KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_HWLOC") endif ifeq ($(KOKKOS_INTERNAL_USE_LIBRT), 1) - tmp := $(shell echo "\#define KOKKOS_USE_LIBRT 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_USE_LIBRT") KOKKOS_LIBS += -lrt endif @@ -444,36 +452,36 @@ ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) KOKKOS_CPPFLAGS += -I$(MEMKIND_PATH)/include KOKKOS_LDFLAGS += -L$(MEMKIND_PATH)/lib KOKKOS_LIBS += -lmemkind -lnuma - tmp := $(shell echo "\#define KOKKOS_HAVE_HBWSPACE 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_HAVE_HBWSPACE") endif ifeq ($(KOKKOS_INTERNAL_DISABLE_PROFILING), 0) - tmp := $(shell echo "\#define KOKKOS_ENABLE_PROFILING" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_PROFILING") endif -tmp := $(shell echo "/* Optimization Settings */" >> KokkosCore_config.tmp) +tmp := $(call kokkos_append_header,"/* Optimization Settings */") ifeq ($(KOKKOS_INTERNAL_OPT_RANGE_AGGRESSIVE_VECTORIZATION), 1) - tmp := $(shell echo "\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION") endif -tmp := $(shell echo "/* Cuda Settings */" >> KokkosCore_config.tmp) +tmp := $(call kokkos_append_header,"/* Cuda Settings */") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LDG), 1) - tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LDG_INTRINSIC") else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LDG_INTRINSIC 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LDG_INTRINSIC") endif endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) - tmp := $(shell echo "\#define KOKKOS_CUDA_USE_UVM 1" >> KokkosCore_config.tmp ) 
+ tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_UVM") endif ifeq ($(KOKKOS_INTERNAL_CUDA_USE_RELOC), 1) - tmp := $(shell echo "\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_RELOCATABLE_DEVICE_CODE") KOKKOS_CXXFLAGS += --relocatable-device-code=true KOKKOS_LDFLAGS += --relocatable-device-code=true endif @@ -481,7 +489,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) ifeq ($(KOKKOS_INTERNAL_CUDA_USE_LAMBDA), 1) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) ifeq ($(shell test $(KOKKOS_INTERNAL_COMPILER_NVCC_VERSION) -gt 70; echo $$?),0) - tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LAMBDA") KOKKOS_CXXFLAGS += -expt-extended-lambda else $(warning Warning: Cuda Lambda support was requested but NVCC version is too low. This requires NVCC for Cuda version 7.5 or higher. Disabling Lambda support now.) @@ -489,19 +497,19 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(shell echo "\#define KOKKOS_CUDA_USE_LAMBDA 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_USE_LAMBDA") endif endif ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - tmp := $(shell echo "\#define KOKKOS_CUDA_CLANG_WORKAROUND" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_CUDA_CLANG_WORKAROUND") endif endif # Add Architecture flags. 
ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -518,7 +526,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV80), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV81 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV81") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -535,8 +543,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV81), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV80 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_ARMV8_THUNDERX 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV80") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ARMV8_THUNDERX") ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY), 1) KOKKOS_CXXFLAGS += @@ -553,7 +561,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ARMV8_THUNDERX), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_SSE42 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_SSE42") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -xSSE4.2 @@ -575,7 +583,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_SSE42), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_AVX 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -mavx @@ -597,7 +605,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_POWER7 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define 
KOKKOS_ARCH_POWER7") ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) @@ -609,7 +617,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER7), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_POWER8 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER8") ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) @@ -630,7 +638,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER8), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_POWER9 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_POWER9") ifeq ($(KOKKOS_INTERNAL_COMPILER_PGI), 1) @@ -651,7 +659,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_POWER9), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -xCORE-AVX2 @@ -673,7 +681,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HSW), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_AVX2 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX2") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -xCORE-AVX2 @@ -695,7 +703,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_BDW), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512MIC 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512MIC") ifeq ($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -xMIC-AVX512 @@ -716,7 +724,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512MIC), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_AVX512XEON 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_AVX512XEON") ifeq 
($(KOKKOS_INTERNAL_COMPILER_INTEL), 1) KOKKOS_CXXFLAGS += -xCORE-AVX512 @@ -737,7 +745,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AVX512XEON), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KNC), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_KNC 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KNC") KOKKOS_CXXFLAGS += -mmic KOKKOS_LDFLAGS += -mmic endif @@ -753,48 +761,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER30 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER30") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER32 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER32") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER35 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER35") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_KEPLER37 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define 
KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KEPLER37") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL50 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL50") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL52 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL52") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_MAXWELL53 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_MAXWELL53") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL60 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL60") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL 
1" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_PASCAL61 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_PASCAL61") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 endif @@ -811,28 +819,28 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) # Lets start with adding architecture defines ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KAVERI), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 701" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_KAVERI 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 701") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_KAVERI") KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx701 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_CARRIZO), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 801" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_CARRIZO 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 801") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_CARRIZO") KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx801 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_FIJI), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 803" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_FIJI 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 803") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_FIJI") KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx803 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VEGA), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 900" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_VEGA 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 900") + tmp := $(call kokkos_append_header,"\#define 
KOKKOS_ARCH_VEGA") KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx900 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_GFX901), 1) - tmp := $(shell echo "\#define KOKKOS_ARCH_ROCM 901" >> KokkosCore_config.tmp ) - tmp := $(shell echo "\#define KOKKOS_ARCH_GFX901 1" >> KokkosCore_config.tmp ) + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_ROCM 901") + tmp := $(call kokkos_append_header,"\#define KOKKOS_ARCH_GFX901") KOKKOS_INTERNAL_ROCM_ARCH_FLAG := --amdgpu-target=gfx901 endif @@ -952,6 +960,10 @@ ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1) KOKKOS_CXXFLAGS += -U__STRICT_ANSI__ endif +# Set KokkosExtraLibs and add -lkokkos to link line +KOKKOS_EXTRA_LIBS := ${KOKKOS_LIBS} +KOKKOS_LIBS := -lkokkos ${KOKKOS_LIBS} + # Setting up dependencies. KokkosCore_config.h: diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 964ec966d5..a63598577c 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -22,8 +22,8 @@ Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokk $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp Kokkos_Spinwait.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Spinwait.cpp -Kokkos_Rendezvous.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_Rendezvous.cpp +Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp Kokkos_Profiling_Interface.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c 
$(KOKKOS_PATH)/core/src/impl/Kokkos_Profiling_Interface.cpp Kokkos_SharedAlloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_SharedAlloc.cpp diff --git a/lib/kokkos/README b/lib/kokkos/README index e5ed39ef53..d6c66634dd 100644 --- a/lib/kokkos/README +++ b/lib/kokkos/README @@ -41,48 +41,44 @@ hcedwar(at)sandia.gov and crtrott(at)sandia.gov ============================================================================ Primary tested compilers on X86 are: - GCC 4.7.2 GCC 4.8.4 - GCC 4.9.2 + GCC 4.9.3 GCC 5.1.0 - GCC 5.2.0 - Intel 14.0.4 + GCC 5.3.0 + GCC 6.1.0 Intel 15.0.2 Intel 16.0.1 - Intel 17.0.098 - Intel 17.1.132 + Intel 17.1.043 + Intel 17.4.196 + Intel 18.0.128 Clang 3.5.2 Clang 3.6.1 Clang 3.7.1 Clang 3.8.1 Clang 3.9.0 - PGI 17.1 + Clang 4.0.0 + Clang 4.0.0 for CUDA (CUDA Toolkit 8.0.44) + PGI 17.10 + NVCC 7.0 for CUDA (with gcc 4.8.4) + NVCC 7.5 for CUDA (with gcc 4.8.4) + NVCC 8.0.44 for CUDA (with gcc 5.3.0) Primary tested compilers on Power 8 are: GCC 5.4.0 (OpenMP,Serial) - IBM XL 13.1.3 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug) + IBM XL 13.1.5 (OpenMP, Serial) (There is a workaround in place to avoid a compiler bug) + NVCC 8.0.44 for CUDA (with gcc 5.4.0) + NVCC 9.0.103 for CUDA (with gcc 6.3.0) Primary tested compilers on Intel KNL are: GCC 6.2.0 - Intel 16.2.181 (with gcc 4.7.2) - Intel 17.0.098 (with gcc 4.7.2) - Intel 17.1.132 (with gcc 4.9.3) + Intel 16.4.258 (with gcc 4.7.2) Intel 17.2.174 (with gcc 4.9.3) - Intel 18.0.061 (beta) (with gcc 4.9.3) - -Secondary tested compilers are: - CUDA 7.0 (with gcc 4.8.4) - CUDA 7.5 (with gcc 4.8.4) - CUDA 8.0 (with gcc 5.3.0 on X86 and gcc 5.4.0 on Power8) - CUDA/Clang 8.0 using Clang/Trunk compiler + Intel 18.0.128 (with gcc 4.9.3) Other compilers working: X86: Cygwin 2.1.0 64bit with gcc 4.9.3 -Limited testing of the following compilers on POWER7+ systems: - GCC 4.8.5 (on RHEL7.1 POWER7+) - Known non-working combinations: Power8: Pthreads backend @@ -96,8 +92,8 
@@ GCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wignored-qualifiers -Wempty-body -Wclobbered -Wuninitialized Intel: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized Clang: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized +NVCC: -Wall -Wshadow -pedantic -Werror -Wsign-compare -Wtype-limits -Wuninitialized -Secondary compilers are passing without -Werror. Other compilers are tested occasionally, in particular when pushing from develop to master branch, without -Werror and only for a select set of backends. diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index 7853184a54..507c9f2fdb 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -2,7 +2,9 @@ TRIBITS_SUBPACKAGE(Algorithms) -ADD_SUBDIRECTORY(src) +IF(KOKKOS_HAS_TRILINOS) + ADD_SUBDIRECTORY(src) +ENDIF() TRIBITS_ADD_TEST_DIRECTORIES(unit_tests) #TRIBITS_ADD_TEST_DIRECTORIES(performance_tests) diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index fde6b967e0..f5aa24e9be 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -3,6 +3,32 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +IF(NOT KOKKOS_HAS_TRILINOS) + IF(KOKKOS_SEPARATE_LIBS) + set(TEST_LINK_TARGETS kokkoscore) + ELSE() + set(TEST_LINK_TARGETS kokkos) + ENDIF() +ENDIF() + +SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) +INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) + +# mfh 03 Nov 2017: The gtest library used here must have a different +# name than that of the gtest library built in KokkosCore. 
We can't +# just refer to the library in KokkosCore's tests, because it's +# possible to build only (e.g.,) KokkosAlgorithms tests, without +# building KokkosCore tests. + +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGTEST_HAS_PTHREAD=0") + +TRIBITS_ADD_LIBRARY( + kokkosalgorithms_gtest + HEADERS ${GTEST_SOURCE_DIR}/gtest/gtest.h + SOURCES ${GTEST_SOURCE_DIR}/gtest/gtest-all.cc + TESTONLY + ) + SET(SOURCES UnitTestMain.cpp TestCuda.cpp @@ -34,5 +60,5 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkosalgorithms_gtest ${TEST_LINK_TARGETS} ) diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index a5a10c82ee..b5848c451e 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -15,7 +15,8 @@ endif CXXFLAGS = -O3 LINK ?= $(CXX) -LDFLAGS ?= -lpthread +LDFLAGS ?= +override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index 04be98f1cc..2cb0b89712 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -211,12 +211,15 @@ void test_dynamic_view_sort(unsigned int n ) const size_t upper_bound = 2 * n ; + const size_t total_alloc_size = n * sizeof(KeyType) * 1.2 ; + const size_t superblock_size = std::min(total_alloc_size, size_t(1000000)); + typename KeyDynamicViewType::memory_pool pool( memory_space() , n * sizeof(KeyType) * 1.2 , 500 /* min block size in bytes */ , 30000 /* max block size in bytes */ - , 1000000 /* min superblock size in bytes */ + , superblock_size ); KeyDynamicViewType keys("Keys",pool,upper_bound); @@ -271,8 +274,10 @@ void test_sort(unsigned int N) { test_1D_sort(N*N*N, true); test_1D_sort(N*N*N, false); +#if !defined(KOKKOS_ENABLE_ROCM) test_3D_sort(N); test_dynamic_view_sort(N*N); 
+#endif } } diff --git a/lib/kokkos/benchmarks/atomic/Makefile b/lib/kokkos/benchmarks/atomic/Makefile new file mode 100644 index 0000000000..41875ee5e4 --- /dev/null +++ b/lib/kokkos/benchmarks/atomic/Makefile @@ -0,0 +1,44 @@ +KOKKOS_PATH = ${HOME}/kokkos +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +EXE_NAME = "test" + +SRC = $(wildcard *.cpp) + +default: build + echo "Start Build" + + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/config/nvcc_wrapper +EXE = ${EXE_NAME}.cuda +KOKKOS_CUDA_OPTIONS = "enable_lambda" +else +CXX = g++ +EXE = ${EXE_NAME}.host +endif + +CXXFLAGS = -O3 + +LINK = ${CXX} +LINKFLAGS = -O3 + +DEPFLAGS = -M + +OBJ = $(SRC:.cpp=.o) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< diff --git a/lib/kokkos/benchmarks/atomic/main.cpp b/lib/kokkos/benchmarks/atomic/main.cpp new file mode 100644 index 0000000000..d86d196249 --- /dev/null +++ b/lib/kokkos/benchmarks/atomic/main.cpp @@ -0,0 +1,124 @@ +#include +#include +#include + +template +double test_atomic(int L, int N, int M,int K,int R,Kokkos::View offsets) { + Kokkos::View output("Output",N); + Kokkos::Impl::Timer timer; + + for(int r = 0; r +double test_no_atomic(int L, int N, int M,int K,int R,Kokkos::View offsets) { + Kokkos::View output("Output",N); + Kokkos::Impl::Timer timer; + for(int r = 0; r\n"); + printf("Example Input GPU:\n"); + printf(" Histogram : 1000000 1000 1 1000 1 10 1\n"); + printf(" MD Force : 100000 100000 100 1000 20 10 4\n"); + printf(" Matrix Assembly : 100000 1000000 50 1000 20 10 4\n"); + Kokkos::finalize(); + return 0; + } + + + int L = atoi(argv[1]); + int N = atoi(argv[2]); + int M = atoi(argv[3]); + int D = 
atoi(argv[4]); + int K = atoi(argv[5]); + int R = atoi(argv[6]); + int type = atoi(argv[7]); + + Kokkos::View offsets("Offsets",L,M); + Kokkos::Random_XorShift64_Pool<> pool(12371); + Kokkos::fill_random(offsets,pool,D); + double time = 0; + if(type==1) + time = test_atomic(L,N,M,K,R,offsets); + if(type==2) + time = test_atomic(L,N,M,K,R,offsets); + if(type==3) + time = test_atomic(L,N,M,K,R,offsets); + if(type==4) + time = test_atomic(L,N,M,K,R,offsets); + if(type==5) + time = test_atomic >(L,N,M,K,R,offsets); + + double time2 = 1; + if(type==1) + time2 = test_no_atomic(L,N,M,K,R,offsets); + if(type==2) + time2 = test_no_atomic(L,N,M,K,R,offsets); + if(type==3) + time2 = test_no_atomic(L,N,M,K,R,offsets); + if(type==4) + time2 = test_no_atomic(L,N,M,K,R,offsets); + if(type==5) + time2 = test_no_atomic >(L,N,M,K,R,offsets); + + int size = 0; + if(type==1) size = sizeof(int); + if(type==2) size = sizeof(long); + if(type==3) size = sizeof(float); + if(type==4) size = sizeof(double); + if(type==5) size = sizeof(Kokkos::complex); + + printf("%i\n",size); + printf("Time: %s %i %i %i %i %i %i (t_atomic: %e t_nonatomic: %e ratio: %lf )( GUpdates/s: %lf GB/s: %lf )\n", + (type==1)?"int": ( + (type==2)?"long": ( + (type==3)?"float": ( + (type==4)?"double":"complex"))), + L,N,M,D,K,R,time,time2,time/time2, + 1.e-9*L*R*M/time, 1.0*L*R*M*2*size/time/1024/1024/1024); +} + Kokkos::finalize(); +} + diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash new file mode 100755 index 0000000000..0b885293e2 --- /dev/null +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/build_code.bash @@ -0,0 +1,84 @@ +#!/bin/bash + +# ---- Default Settings ----- + +# Paths +KOKKOS_PATH=${PWD}/kokkos +KOKKOS_KERNELS_PATH=${PWD}/kokkos-kernels +MINIMD_PATH=${PWD}/miniMD/kokkos +MINIFE_PATH=${PWD}/miniFE/kokkos + +# Kokkos Configure Options +KOKKOS_DEVICES=OpenMP +KOKKOS_ARCH=SNB + +# Compiler Options +CXX=mpicxx 
+OPT_FLAG="-O3" + +while [[ $# > 0 ]] +do + key="$1" + + case $key in + --kokkos-path*) + KOKKOS_PATH="${key#*=}" + ;; + --kokkos-kernels-path*) + KOKKOS_KERNELS_PATH="${key#*=}" + ;; + --minimd-path*) + MINIMD_PATH="${key#*=}" + ;; + --minife-path*) + MINIFE_PATH="${key#*=}" + ;; + --device-list*) + KOKKOS_DEVICES="${key#*=}" + ;; + --arch*) + KOKKOS_ARCH="--arch=${key#*=}" + ;; + --opt-flag*) + OPT_FLAG="${key#*=}" + ;; + --compiler*) + CXX="${key#*=}" + ;; + --with-cuda-options*) + KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" + ;; + --help*) + PRINT_HELP=True + ;; + *) + # args, just append + ARGS="$ARGS $1" + ;; + esac + + shift +done + +mkdir build + +# Build BytesAndFlops +mkdir build/bytes_and_flops +cd build/bytes_and_flops +make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH}\ + CXXFLAGS=${OPT_FLAG} -f ${KOKKOS_PATH}/benchmarks/bytes_and_flops/Makefile -j 16 +cd ../.. + +mkdir build/miniMD +cd build/miniMD +make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \ + CXXFLAGS=${OPT_FLAG} -f ${MINIMD_PATH}/Makefile -j 16 +cd ../../ + +mkdir build/miniFE +cd build/miniFE +make KOKKOS_ARCH=${KOKKOS_ARCH} KOKKOS_DEVICES=${KOKKOS_DEVICES} CXX=${CXX} KOKKOS_PATH=${KOKKOS_PATH} \ + CXXFLAGS=${OPT_FLAG} -f ${MINIFE_PATH}/src/Makefile -j 16 +cd ../../ + + diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash new file mode 100755 index 0000000000..9b52a36d89 --- /dev/null +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/checkout_repos.bash @@ -0,0 +1,37 @@ +#!/bin/bash + +# Kokkos +if [ ! -d "kokkos" ]; then + git clone https://github.com/kokkos/kokkos +fi +cd kokkos +git checkout develop +git pull +cd .. + +# KokkosKernels +if [ ! -d "kokkos-kernels" ]; then +git clone https://github.com/kokkos/kokkos-kernels +fi +cd kokkos-kernels +git pull +cd .. + +# MiniMD +if [ ! 
-d "miniMD" ]; then + git clone https://github.com/mantevo/miniMD +fi +cd miniMD +git pull +cd .. + +# MiniFE +if [ ! -d "miniFE" ]; then + git clone https://github.com/mantevo/miniFE +fi +cd miniFE +git pull +cd .. + + + diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash new file mode 100755 index 0000000000..6afa05f5fc --- /dev/null +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_benchmark.bash @@ -0,0 +1,14 @@ +#!/bin/bash +SCRIPT_PATH=$1 +KOKKOS_DEVICES=$2 +KOKKOS_ARCH=$3 +COMPILER=$4 +if [[ $# < 4 ]]; then + echo "Usage: ./run_benchmark.bash PATH_TO_SCRIPTS KOKKOS_DEVICES KOKKOS_ARCH COMPILER" +else + +${SCRIPT_PATH}/checkout_repos.bash +${SCRIPT_PATH}/build_code.bash --arch=${KOKKOS_ARCH} --device-list=${KOKKOS_DEVICES} --compiler=${COMPILER} +${SCRIPT_PATH}/run_tests.bash + +fi \ No newline at end of file diff --git a/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash new file mode 100755 index 0000000000..63aaca9e40 --- /dev/null +++ b/lib/kokkos/benchmarks/benchmark_suite/scripts/run_tests.bash @@ -0,0 +1,44 @@ +#!/bin/bash + +# BytesAndFlops +cd build/bytes_and_flops + +USE_CUDA=`grep "_CUDA 1" KokkosCore_config.h | wc -l` + +if [[ ${USE_CUDA} > 0 ]]; then + BAF_EXE=bytes_and_flops.cuda + TEAM_SIZE=256 +else + BAF_EXE=bytes_and_flops.host + TEAM_SIZE=1 +fi + +BAF_PERF_1=`./${BAF_EXE} 2 100000 1024 1 1 1 1 ${TEAM_SIZE} 6000 | awk '{print $12/174.5}'` +BAF_PERF_2=`./${BAF_EXE} 2 100000 1024 16 1 8 64 ${TEAM_SIZE} 6000 | awk '{print $14/1142.65}'` + +echo "BytesAndFlops: ${BAF_PERF_1} ${BAF_PERF_2}" +cd ../.. 
+ + +# MiniMD +cd build/miniMD +cp ../../miniMD/kokkos/Cu_u6.eam ./ +MD_PERF_1=`./miniMD --half_neigh 0 -s 60 --ntypes 1 -t ${OMP_NUM_THREADS} -i ../../miniMD/kokkos/in.eam.miniMD | grep PERF_SUMMARY | awk '{print $10/21163341}'` +MD_PERF_2=`./miniMD --half_neigh 0 -s 20 --ntypes 1 -t ${OMP_NUM_THREADS} -i ../../miniMD/kokkos/in.eam.miniMD | grep PERF_SUMMARY | awk '{print $10/13393417}'` + +echo "MiniMD: ${MD_PERF_1} ${MD_PERF_2}" +cd ../.. + +# MiniFE +cd build/miniFE +rm *.yaml +./miniFE.x -nx 100 &> /dev/null +FE_PERF_1=`grep "CG Mflop" *.yaml | awk '{print $4/14174}'` +rm *.yaml +./miniFE.x -nx 50 &> /dev/null +FE_PERF_2=`grep "CG Mflop" *.yaml | awk '{print $4/11897}'` +cd ../.. +echo "MiniFE: ${FE_PERF_1} ${FE_PERF_2}" + +PERF_RESULT=`echo "${BAF_PERF_1} ${BAF_PERF_2} ${MD_PERF_1} ${MD_PERF_2} ${FE_PERF_1} ${FE_PERF_2}" | awk '{print ($1+$2+$3+$4+$5+$6)/6}'` +echo "Total Result: " ${PERF_RESULT} \ No newline at end of file diff --git a/lib/kokkos/benchmarks/bytes_and_flops/Makefile b/lib/kokkos/benchmarks/bytes_and_flops/Makefile index 5ddf78f28e..6cbef56ff0 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/Makefile +++ b/lib/kokkos/benchmarks/bytes_and_flops/Makefile @@ -1,7 +1,18 @@ -KOKKOS_PATH = ${HOME}/kokkos -SRC = $(wildcard *.cpp) KOKKOS_DEVICES=Cuda KOKKOS_CUDA_OPTIONS=enable_lambda +KOKKOS_ARCH = "SNB,Kepler35" + + +MAKEFILE_PATH := $(subst Makefile,,$(abspath $(lastword $(MAKEFILE_LIST)))) + +ifndef KOKKOS_PATH + KOKKOS_PATH = $(MAKEFILE_PATH)../.. 
+endif + +SRC = $(wildcard $(MAKEFILE_PATH)*.cpp) +HEADERS = $(wildcard $(MAKEFILE_PATH)*.hpp) + +vpath %.cpp $(sort $(dir $(SRC))) default: build echo "Start Build" @@ -9,22 +20,19 @@ default: build ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper EXE = bytes_and_flops.cuda -KOKKOS_DEVICES = "Cuda,OpenMP" -KOKKOS_ARCH = "SNB,Kepler35" else CXX = g++ EXE = bytes_and_flops.host -KOKKOS_DEVICES = "OpenMP" -KOKKOS_ARCH = "SNB" endif -CXXFLAGS = -O3 -g +CXXFLAGS ?= -O3 -g +override CXXFLAGS += -I$(MAKEFILE_PATH) DEPFLAGS = -M LINK = ${CXX} LINKFLAGS = -OBJ = $(SRC:.cpp=.o) +OBJ = $(notdir $(SRC:.cpp=.o)) LIB = include $(KOKKOS_PATH)/Makefile.kokkos @@ -39,5 +47,5 @@ clean: kokkos-clean # Compilation rules -%.o:%.cpp $(KOKKOS_CPP_DEPENDS) bench.hpp bench_unroll_stride.hpp bench_stride.hpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) $(HEADERS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp index 8c79f3b88d..11576413e2 100644 --- a/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp +++ b/lib/kokkos/benchmarks/policy_performance/policy_perf_test.hpp @@ -69,11 +69,11 @@ void test_policy(int team_range, int thread_range, int vector_range, int team_size, int vector_size, int test_type, ViewType1 &v1, ViewType2 &v2, ViewType3 &v3, double &result, double &result_expect, double &time) { - + typedef Kokkos::TeamPolicy t_policy; typedef typename t_policy::member_type t_team; Kokkos::Timer timer; - + for(int orep = 0; orep(v1) #if 0 // This does not compile with pre Cuda 8.0 - see Github Issue #913 for explanation diff --git a/lib/kokkos/bin/hpcbind b/lib/kokkos/bin/hpcbind index b88b334f8b..92f9f81ac9 100755 --- a/lib/kokkos/bin/hpcbind +++ b/lib/kokkos/bin/hpcbind @@ -26,6 +26,7 @@ fi # 
Get parent cpuset HPCBIND_HWLOC_PARENT_CPUSET="" if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then + HPCBIND_HWLOC_VERSION="$(hwloc-ls --version | cut -d ' ' -f 2)" MY_PID="$BASHPID" HPCBIND_HWLOC_PARENT_CPUSET="$(hwloc-ps -a --cpuset | grep ${MY_PID} | cut -f 2)" fi @@ -45,8 +46,11 @@ declare -i NUM_GPUS=0 HPCBIND_VISIBLE_GPUS="" if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then NUM_GPUS=$(nvidia-smi -L | wc -l); - GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" - HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} + HPCBIND_HAS_NVIDIA=$((!$?)) + if [[ ${HPCBIND_HAS_NVIDIA} -eq 1 ]]; then + GPU_LIST="$( seq 0 $((NUM_GPUS-1)) )" + HPCBIND_VISIBLE_GPUS=${CUDA_VISIBLE_DEVICES:-${GPU_LIST}} + fi fi declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) @@ -57,33 +61,38 @@ declare -i HPCBIND_ENABLE_GPU_MAPPING=$((NUM_GPUS > 0)) # supports sbatch, bsub, aprun ################################################################################ HPCBIND_QUEUE_NAME="" -declare -i HPCBIND_QUEUE_INDEX=0 +declare -i HPCBIND_QUEUE_RANK=0 +declare -i HPCBIND_QUEUE_SIZE=0 declare -i HPCBIND_QUEUE_MAPPING=0 if [[ ! -z "${PMI_RANK}" ]]; then HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="mpich" - HPCBIND_QUEUE_INDEX=${PMI_RANK} + HPCBIND_QUEUE_RANK=${PMI_RANK} + HPCBIND_QUEUE_SIZE=${PMI_SIZE} elif [[ ! -z "${OMPI_COMM_WORLD_RANK}" ]]; then HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="openmpi" - HPCBIND_QUEUE_INDEX=${OMPI_COMM_WORLD_RANK} + HPCBIND_QUEUE_RANK=${OMPI_COMM_WORLD_RANK} + HPCBIND_QUEUE_SIZE=${OMPI_COMM_WORLD_SIZE} elif [[ ! -z "${MV2_COMM_WORLD_RANK}" ]]; then HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="mvapich2" - HPCBIND_QUEUE_INDEX=${MV2_COMM_WORLD_RANK} + HPCBIND_QUEUE_RANK=${MV2_COMM_WORLD_RANK} + HPCBIND_QUEUE_SIZE=${MV2_COMM_WORLD_SIZE} elif [[ ! -z "${SLURM_LOCAL_ID}" ]]; then HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="slurm" - HPCBIND_QUEUE_INDEX=${SLURM_LOCAL_ID} -elif [[ ! 
-z "${LBS_JOBINDEX}" ]]; then - HPCBIND_QUEUE_MAPPING=1 - HPCBIND_QUEUE_NAME="bsub" - HPCBIND_QUEUE_INDEX=${LBS_JOBINDEX} + HPCBIND_QUEUE_RANK=${SLURM_PROCID} + HPCBIND_QUEUE_SIZE=${SLURM_NPROCS} elif [[ ! -z "${ALPS_APP_PE}" ]]; then HPCBIND_QUEUE_MAPPING=1 HPCBIND_QUEUE_NAME="aprun" - HPCBIND_QUEUE_INDEX=${ALPS_APP_PE} + HPCBIND_QUEUE_RANK=${ALPS_APP_PE} +elif [[ ! -z "${LBS_JOBINDEX}" ]]; then + HPCBIND_QUEUE_MAPPING=1 + HPCBIND_QUEUE_NAME="bsub" + HPCBIND_QUEUE_RANK=${LBS_JOBINDEX} fi ################################################################################ @@ -113,8 +122,8 @@ function show_help { echo " --no-gpu-mapping Do not set CUDA_VISIBLE_DEVICES" echo " --openmp=M.m Set env variables for the given OpenMP version" echo " Default: 4.0" - echo " --openmp-percent=N Integer percentage of cpuset to use for OpenMP" - echo " threads Default: 100" + echo " --openmp-ratio=N/D Ratio of the cpuset to use for OpenMP" + echo " Default: 1" echo " --openmp-places= Op=threads|cores|sockets. Default: threads" echo " --no-openmp-proc-bind Set OMP_PROC_BIND to false and unset OMP_PLACES" echo " --force-openmp-num-threads=N" @@ -123,8 +132,8 @@ function show_help { echo " Override logic for selecting OMP_PROC_BIND" echo " --no-openmp-nested Set OMP_NESTED to false" echo " --output-prefix=

Save the output to files of the form" - echo " P-N.log, P-N.out and P-N.err where P is the prefix" - echo " and N is the queue index or mpi rank (no spaces)" + echo " P.hpcbind.N, P.stdout.N and P.stderr.N where P is " + echo " the prefix and N is the rank (no spaces)" echo " --output-mode= How console output should be handled." echo " Options are all, rank0, and none. Default: rank0" echo " --lstopo Show bindings in lstopo" @@ -132,20 +141,27 @@ function show_help { echo " -h|--help Show this message" echo "" echo "Sample Usage:" + echo "" echo " Split the current process cpuset into 4 and use the 3rd partition" echo " ${cmd} --distribute=4 --distribute-partition=2 -v -- command ..." + echo "" echo " Launch 16 jobs over 4 nodes with 4 jobs per node using only the even pus" echo " and save the output to rank specific files" echo " mpiexec -N 16 -npernode 4 ${cmd} --whole-system --proc-bind=pu:even \\" echo " --distribute=4 -v --output-prefix=output -- command ..." + echo "" echo " Bind the process to all even cores" echo " ${cmd} --proc-bind=core:even -v -- command ..." + echo "" echo " Bind the the even cores of socket 0 and the odd cores of socket 1" echo " ${cmd} --proc-bind='socket:0.core:even socket:1.core:odd' -v -- command ..." + echo "" echo " Skip GPU 0 when mapping visible devices" echo " ${cmd} --distribute=4 --distribute-partition=0 --visible-gpus=1,2 -v -- command ..." 
+ echo "" echo " Display the current bindings" echo " ${cmd} --proc-bind=numa:0 -- command" + echo "" echo " Display the current bindings using lstopo" echo " ${cmd} --proc-bind=numa:0.core:odd --lstopo" echo "" @@ -167,12 +183,13 @@ declare -i HPCBIND_DISTRIBUTE=1 declare -i HPCBIND_PARTITION=-1 HPCBIND_PROC_BIND="all" HPCBIND_OPENMP_VERSION=4.0 -declare -i HPCBIND_OPENMP_PERCENT=100 +declare -i HPCBIND_OPENMP_RATIO_NUMERATOR=1 +declare -i HPCBIND_OPENMP_RATIO_DENOMINATOR=1 HPCBIND_OPENMP_PLACES=${OMP_PLACES:-threads} declare -i HPCBIND_OPENMP_PROC_BIND=1 -declare -i HPCBIND_OPENMP_FORCE_NUM_THREADS=-1 +HPCBIND_OPENMP_FORCE_NUM_THREADS="" HPCBIND_OPENMP_FORCE_PROC_BIND="" -HPCBIND_OPENMP_NESTED=${OMP_NESTED:-true} +declare -i HPCBIND_OPENMP_NESTED=1 declare -i HPCBIND_VERBOSE=0 declare -i HPCBIND_LSTOPO=0 @@ -199,6 +216,9 @@ for i in "$@"; do ;; --distribute=*) HPCBIND_DISTRIBUTE="${i#*=}" + if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then + HPCBIND_DISTRIBUTE=1 + fi shift ;; # which partition to use @@ -222,8 +242,18 @@ for i in "$@"; do HPCBIND_OPENMP_VERSION="${i#*=}" shift ;; - --openmp-percent=*) - HPCBIND_OPENMP_PERCENT="${i#*=}" + --openmp-ratio=*) + IFS=/ read HPCBIND_OPENMP_RATIO_NUMERATOR HPCBIND_OPENMP_RATIO_DENOMINATOR <<< "${i#*=}" + if [[ ${HPCBIND_OPENMP_RATIO_NUMERATOR} -le 0 ]]; then + HPCBIND_OPENMP_RATIO_NUMERATOR=1 + fi + if [[ ${HPCBIND_OPENMP_RATIO_DENOMINATOR} -le 0 ]]; then + HPCBIND_OPENMP_RATIO_DENOMINATOR=1 + fi + if [[ ${HPCBIND_OPENMP_RATIO_NUMERATOR} -gt ${HPCBIND_OPENMP_RATIO_DENOMINATOR} ]]; then + HPCBIND_OPENMP_RATIO_NUMERATOR=1 + HPCBIND_OPENMP_RATIO_DENOMINATOR=1 + fi shift ;; --openmp-places=*) @@ -243,7 +273,7 @@ for i in "$@"; do shift ;; --no-openmp-nested) - HPCBIND_OPENMP_NESTED="false" + HPCBIND_OPENMP_NESTED=0 shift ;; --output-prefix=*) @@ -292,7 +322,7 @@ if [[ "${HPCBIND_OUTPUT_MODE}" == "none" ]]; then HPCBIND_TEE=0 elif [[ "${HPCBIND_OUTPUT_MODE}" == "all" ]]; then HPCBIND_TEE=1 -elif [[ ${HPCBIND_QUEUE_INDEX} -eq 0 ]]; 
then +elif [[ ${HPCBIND_QUEUE_RANK} -eq 0 ]]; then #default to rank0 printing to screen HPCBIND_TEE=1 fi @@ -303,9 +333,18 @@ if [[ "${HPCBIND_OUTPUT_PREFIX}" == "" ]]; then HPCBIND_ERR=/dev/null HPCBIND_OUT=/dev/null else - HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.hpc.log" - HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.err" - HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}-${HPCBIND_QUEUE_INDEX}.out" + if [[ ${HPCBIND_QUEUE_SIZE} -gt 0 ]]; then + HPCBIND_STR_QUEUE_SIZE="${HPCBIND_QUEUE_SIZE}" + HPCBIND_STR_QUEUE_RANK=$(printf %0*d ${#HPCBIND_STR_QUEUE_SIZE} ${HPCBIND_QUEUE_RANK}) + + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_STR_QUEUE_RANK}" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_STR_QUEUE_RANK}" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_STR_QUEUE_RANK}" + else + HPCBIND_LOG="${HPCBIND_OUTPUT_PREFIX}.hpcbind.${HPCBIND_QUEUE_RANK}" + HPCBIND_ERR="${HPCBIND_OUTPUT_PREFIX}.stderr.${HPCBIND_QUEUE_RANK}" + HPCBIND_OUT="${HPCBIND_OUTPUT_PREFIX}.stdout.${HPCBIND_QUEUE_RANK}" + fi > ${HPCBIND_LOG} fi @@ -333,27 +372,12 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then NUM_GPUS=${#HPCBIND_VISIBLE_GPUS[@]} fi -################################################################################ -# Check OpenMP percent -################################################################################ -if [[ ${HPCBIND_OPENMP_PERCENT} -lt 1 ]]; then - HPCBIND_OPENMP_PERCENT=1 -elif [[ ${HPCBIND_OPENMP_PERCENT} -gt 100 ]]; then - HPCBIND_OPENMP_PERCENT=100 -fi - -################################################################################ -# Check distribute -################################################################################ -if [[ ${HPCBIND_DISTRIBUTE} -le 0 ]]; then - HPCBIND_DISTRIBUTE=1 -fi ################################################################################ #choose the correct partition ################################################################################ if [[ 
${HPCBIND_PARTITION} -lt 0 && ${HPCBIND_QUEUE_MAPPING} -eq 1 ]]; then - HPCBIND_PARTITION=${HPCBIND_QUEUE_INDEX} + HPCBIND_PARTITION=${HPCBIND_QUEUE_RANK} elif [[ ${HPCBIND_PARTITION} -lt 0 ]]; then HPCBIND_PARTITION=0 fi @@ -381,23 +405,40 @@ if [[ ${HPCBIND_ENABLE_HWLOC_BIND} -eq 1 ]]; then else HPCBIND_HWLOC_CPUSET="${BINDING}" fi - HPCBIND_NUM_PUS=$(hwloc-ls --restrict ${HPCBIND_HWLOC_CPUSET} --only pu | wc -l) + HPCBIND_NUM_PUS=$(hwloc-calc -q -N pu ${HPCBIND_HWLOC_CPUSET} ) + if [ $? -ne 0 ]; then + HPCBIND_NUM_PUS=1 + fi + HPCBIND_NUM_CORES=$(hwloc-calc -q -N core ${HPCBIND_HWLOC_CPUSET} ) + if [ $? -ne 0 ]; then + HPCBIND_NUM_CORES=1 + fi + HPCBIND_NUM_NUMAS=$(hwloc-calc -q -N numa ${HPCBIND_HWLOC_CPUSET} ) + if [ $? -ne 0 ]; then + HPCBIND_NUM_NUMAS=1 + fi + HPCBIND_NUM_SOCKETS=$(hwloc-calc -q -N socket ${HPCBIND_HWLOC_CPUSET} ) + if [ $? -ne 0 ]; then + HPCBIND_NUM_SOCKETS=1 + fi else HPCBIND_NUM_PUS=$(cat /proc/cpuinfo | grep -c processor) + HPCBIND_NUM_CORES=${HPCBIND_NUM_PUS} + HPCBIND_NUM_NUMAS=1 + HPCBIND_NUM_SOCKETS=1 fi -declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_PERCENT)) -HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_OPENMP_NUM_THREADS / 100)) - -if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then - HPCBIND_OPENMP_NUM_THREADS=1 -elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then - HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS} -fi - -if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} -gt 0 ]]; then +if [[ ${HPCBIND_OPENMP_FORCE_NUM_THREADS} != "" ]]; then HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_OPENMP_FORCE_NUM_THREADS} +else + declare -i HPCBIND_OPENMP_NUM_THREADS=$((HPCBIND_NUM_PUS * HPCBIND_OPENMP_RATIO_NUMERATOR / HPCBIND_OPENMP_RATIO_DENOMINATOR)) + + if [[ ${HPCBIND_OPENMP_NUM_THREADS} -lt 1 ]]; then + HPCBIND_OPENMP_NUM_THREADS=1 + elif [[ ${HPCBIND_OPENMP_NUM_THREADS} -gt ${HPCBIND_NUM_PUS} ]]; then + HPCBIND_OPENMP_NUM_THREADS=${HPCBIND_NUM_PUS} + fi fi 
################################################################################ @@ -405,7 +446,11 @@ fi ################################################################################ # set OMP_NUM_THREADS -export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS} +if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then + export OMP_NUM_THREADS="${HPCBIND_OPENMP_NUM_THREADS},1" +else + export OMP_NUM_THREADS=${HPCBIND_OPENMP_NUM_THREADS} +fi # set OMP_PROC_BIND and OMP_PLACES if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then @@ -413,7 +458,11 @@ if [[ ${HPCBIND_OPENMP_PROC_BIND} -eq 1 ]]; then #default proc bind logic if [[ "${HPCBIND_OPENMP_VERSION}" == "4.0" || "${HPCBIND_OPENMP_VERSION}" > "4.0" ]]; then export OMP_PLACES="${HPCBIND_OPENMP_PLACES}" - export OMP_PROC_BIND="spread" + if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then + export OMP_PROC_BIND="spread,spread" + else + export OMP_PROC_BIND="spread" + fi else export OMP_PROC_BIND="true" unset OMP_PLACES @@ -429,9 +478,17 @@ else unset OMP_PROC_BIND fi -# set OMP_NESTED -export OMP_NESTED=${HPCBIND_OPENMP_NESTED} +# set up hot teams (intel specific) +if [[ ${HPCBIND_OPENMP_NESTED} -eq 1 ]]; then + export OMP_NESTED="true" + export OMP_MAX_ACTIVE_LEVELS=2 + export KMP_HOT_TEAMS=1 + export KMP_HOT_TEAMS_MAX_LEVEL=2 +else + export OMP_NESTED="false" +fi +# set OMP_NESTED ################################################################################ # Set CUDA environment variables @@ -442,7 +499,7 @@ if [[ ${HPCBIND_ENABLE_GPU_MAPPING} -eq 1 ]]; then declare -i GPU_ID=$((HPCBIND_PARTITION % NUM_GPUS)) export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" else - declare -i MY_TASK_ID=$((HPCBIND_QUEUE_INDEX * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) + declare -i MY_TASK_ID=$((HPCBIND_QUEUE_RANK * HPCBIND_DISTRIBUTE + HPCBIND_PARTITION)) declare -i GPU_ID=$((MY_TASK_ID % NUM_GPUS)) export CUDA_VISIBLE_DEVICES="${HPCBIND_VISIBLE_GPUS[${GPU_ID}]}" fi @@ -451,12 +508,17 @@ fi 
################################################################################ # Set hpcbind environment variables ################################################################################ +export HPCBIND_HWLOC_VERSION=${HPCBIND_HWLOC_VERSION} export HPCBIND_HAS_HWLOC=${HPCBIND_HAS_HWLOC} export HPCBIND_HAS_NVIDIA=${HPCBIND_HAS_NVIDIA} export HPCBIND_NUM_PUS=${HPCBIND_NUM_PUS} +export HPCBIND_NUM_CORES=${HPCBIND_NUM_CORES} +export HPCBIND_NUM_NUMAS=${HPCBIND_NUM_NUMAS} +export HPCBIND_NUM_SOCKETS=${HPCBIND_NUM_SOCKETS} export HPCBIND_HWLOC_CPUSET="${HPCBIND_HWLOC_CPUSET}" export HPCBIND_HWLOC_DISTRIBUTE=${HPCBIND_DISTRIBUTE} export HPCBIND_HWLOC_DISTRIBUTE_PARTITION=${HPCBIND_PARTITION} +export HPCBIND_OPENMP_RATIO="${HPCBIND_OPENMP_RATIO_NUMERATOR}/${HPCBIND_OPENMP_RATIO_DENOMINATOR}" if [[ "${HPCBIND_HWLOC_PARENT_CPUSET}" == "" ]]; then export HPCBIND_HWLOC_PARENT_CPUSET="all" else @@ -467,7 +529,8 @@ export HPCBIND_NVIDIA_ENABLE_GPU_MAPPING=${HPCBIND_ENABLE_GPU_MAPPING} export HPCBIND_NVIDIA_VISIBLE_GPUS=$(echo "${HPCBIND_VISIBLE_GPUS[*]}" | tr ' ' ',') export HPCBIND_OPENMP_VERSION="${HPCBIND_OPENMP_VERSION}" if [[ "${HPCBIND_QUEUE_NAME}" != "" ]]; then - export HPCBIND_QUEUE_INDEX=${HPCBIND_QUEUE_INDEX} + export HPCBIND_QUEUE_RANK=${HPCBIND_QUEUE_RANK} + export HPCBIND_QUEUE_SIZE=${HPCBIND_QUEUE_SIZE} export HPCBIND_QUEUE_NAME="${HPCBIND_QUEUE_NAME}" export HPCBIND_QUEUE_MAPPING=${HPCBIND_QUEUE_MAPPING} fi @@ -487,10 +550,16 @@ if [[ ${HPCBIND_TEE} -eq 0 || ${HPCBIND_VERBOSE} -eq 0 ]]; then echo "${TMP_ENV}" | grep -E "^CUDA_" >> ${HPCBIND_LOG} echo "[OPENMP]" >> ${HPCBIND_LOG} echo "${TMP_ENV}" | grep -E "^OMP_" >> ${HPCBIND_LOG} + echo "[GOMP] (gcc, g++, and gfortran)" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^GOMP_" >> ${HPCBIND_LOG} + echo "[KMP] (icc, icpc, and ifort)" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E "^KMP_" >> ${HPCBIND_LOG} + echo "[XLSMPOPTS] (xlc, xlc++, and xlf)" >> ${HPCBIND_LOG} + echo "${TMP_ENV}" | grep -E 
"^XLSMPOPTS" >> ${HPCBIND_LOG} if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then echo "[BINDINGS]" >> ${HPCBIND_LOG} - hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu >> ${HPCBIND_LOG} + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" >> ${HPCBIND_LOG} else echo "Unable to show bindings, hwloc not available." >> ${HPCBIND_LOG} fi @@ -503,10 +572,16 @@ else echo "${TMP_ENV}" | grep -E "^CUDA_" > >(tee -a ${HPCBIND_LOG}) echo "[OPENMP]" > >(tee -a ${HPCBIND_LOG}) echo "${TMP_ENV}" | grep -E "^OMP_" > >(tee -a ${HPCBIND_LOG}) + echo "[GOMP] (gcc, g++, and gfortran)" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^GOMP_" > >(tee -a ${HPCBIND_LOG}) + echo "[KMP] (icc, icpc, and ifort)" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^KMP_" > >(tee -a ${HPCBIND_LOG}) + echo "[XLSMPOPTS] (xlc, xlc++, and xlf)" > >(tee -a ${HPCBIND_LOG}) + echo "${TMP_ENV}" | grep -E "^XLSMPOPTS" > >(tee -a ${HPCBIND_LOG}) if [[ ${HPCBIND_HAS_HWLOC} -eq 1 ]]; then echo "[BINDINGS]" > >(tee -a ${HPCBIND_LOG}) - hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --only pu > >(tee -a ${HPCBIND_LOG}) + hwloc-ls --restrict "${HPCBIND_HWLOC_CPUSET}" --no-io --no-bridges > >(tee -a ${HPCBIND_LOG}) else echo "Unable to show bindings, hwloc not available." 
> >(tee -a ${HPCBIND_LOG}) fi diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 76e33f3c66..d339da4fcd 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -39,6 +39,12 @@ cuda_args="" # Arguments for both NVCC and Host compiler shared_args="" +# Argument -c +compile_arg="" + +# Argument -o +output_arg="" + # Linker arguments xlinker_args="" @@ -66,6 +72,7 @@ dry_run=0 # Skip NVCC compilation and use host compiler directly host_only=0 +host_only_args="" # Enable workaround for CUDA 6.5 for pragma ident replace_pragma_ident=0 @@ -81,6 +88,11 @@ optimization_applied=0 # Check if we have -std=c++X or --std=c++X already stdcxx_applied=0 +# Run nvcc a second time to generate dependencies if needed +depfile_separate=0 +depfile_output_arg="" +depfile_target_arg="" + #echo "Arguments: $# $@" while [ $# -gt 0 ] @@ -112,12 +124,31 @@ do fi ;; #Handle shared args (valid for both nvcc and the host compiler) - -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared) + -D*|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared) shared_args="$shared_args $1" ;; - #Handle shared args that have an argument - -o|-MT) - shared_args="$shared_args $1 $2" + #Handle compilation argument + -c) + compile_arg="$1" + ;; + #Handle output argument + -o) + output_arg="$output_arg $1 $2" + shift + ;; + # Handle depfile arguments. We map them to a separate call to nvcc. 
+ -MD|-MMD) + depfile_separate=1 + host_only_args="$host_only_args $1" + ;; + -MF) + depfile_output_arg="-o $2" + host_only_args="$host_only_args $1 $2" + shift + ;; + -MT) + depfile_target_arg="$1 $2" + host_only_args="$host_only_args $1 $2" shift ;; #Handle known nvcc args @@ -242,7 +273,7 @@ if [ $first_xcompiler_arg -eq 0 ]; then fi #Compose host only command -host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host" +host_command="$host_compiler $shared_args $host_only_args $compile_arg $output_arg $xcompiler_args $host_linker_args $shared_versioned_libraries_host" #nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING' if [ $replace_pragma_ident -eq 1 ]; then @@ -274,10 +305,21 @@ else host_command="$host_command $object_files" fi +if [ $depfile_separate -eq 1 ]; then + # run nvcc a second time to generate dependencies (without compiling) + nvcc_depfile_command="$nvcc_command -M $depfile_target_arg $depfile_output_arg" +else + nvcc_depfile_command="" +fi + +nvcc_command="$nvcc_command $compile_arg $output_arg" + #Print command for dryrun if [ $dry_run -eq 1 ]; then if [ $host_only -eq 1 ]; then echo $host_command + elif [ -n "$nvcc_depfile_command" ]; then + echo $nvcc_command "&&" $nvcc_depfile_command else echo $nvcc_command fi @@ -287,6 +329,8 @@ fi #Run compilation command if [ $host_only -eq 1 ]; then $host_command +elif [ -n "$nvcc_depfile_command" ]; then + $nvcc_command && $nvcc_depfile_command else $nvcc_command fi diff --git a/lib/kokkos/cmake/Makefile.generate_cmake_settings b/lib/kokkos/cmake/Makefile.generate_cmake_settings new file mode 100644 index 0000000000..da076b23db --- /dev/null +++ b/lib/kokkos/cmake/Makefile.generate_cmake_settings @@ -0,0 +1,8 @@ +ifndef KOKKOS_PATH + MAKEFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) + KOKKOS_PATH = $(subst Makefile,,$(MAKEFILE_PATH)).. 
+endif + +include $(KOKKOS_PATH)/Makefile.kokkos +include $(KOKKOS_PATH)/core/src/Makefile.generate_header_lists +include $(KOKKOS_PATH)/core/src/Makefile.generate_build_files diff --git a/lib/kokkos/cmake/kokkos.cmake b/lib/kokkos/cmake/kokkos.cmake deleted file mode 100644 index 396822c7fa..0000000000 --- a/lib/kokkos/cmake/kokkos.cmake +++ /dev/null @@ -1,1202 +0,0 @@ - - -# Set which Kokkos backend to use. -set(KOKKOS_ENABLE_CUDA OFF CACHE BOOL "Use Kokkos CUDA backend") -set(KOKKOS_ENABLE_OPENMP ON CACHE BOOL "Use Kokkos OpenMP backend") -set(KOKKOS_ENABLE_PTHREAD OFF CACHE BOOL "Use Kokkos Pthreads backend") -set(KOKKOS_ENABLE_QTHREADS OFF CACHE BOOL "Use Kokkos Qthreads backend") -set(KOKKOS_ENABLE_SERIAL ON CACHE BOOL "Use Kokkos Serial backend") - -# List of possible host architectures. -list(APPEND KOKKOS_HOST_ARCH_LIST - None # No architecture optimization - AMDAVX # AMD chip - ARMv80 # ARMv8.0 Compatible CPU - ARMv81 # ARMv8.1 Compatible CPU - ARMv8-ThunderX # ARMv8 Cavium ThunderX CPU - SNB # Intel Sandy/Ivy Bridge CPUs - HSW # Intel Haswell CPUs - BDW # Intel Broadwell Xeon E-class CPUs - SKX # Intel Sky Lake Xeon E-class HPC CPUs (AVX512) - KNC # Intel Knights Corner Xeon Phi - KNL # Intel Knights Landing Xeon Phi - BGQ # IBM Blue Gene Q - Power7 # IBM POWER7 CPUs - Power8 # IBM POWER8 CPUs - Power9 # IBM POWER9 CPUs - ) - -# Setting this variable to a value other than "None" can improve host -# performance by turning on architecture specific code. -set(KOKKOS_HOST_ARCH "None" CACHE STRING "Optimize for specific host architecture.") -set_property(CACHE KOKKOS_HOST_ARCH PROPERTY STRINGS ${KOKKOS_HOST_ARCH_LIST}) - -# List of possible GPU architectures. 
-list(APPEND KOKKOS_GPU_ARCH_LIST - None # No architecture optimization - Kepler # NVIDIA Kepler default (generation CC 3.5) - Kepler30 # NVIDIA Kepler generation CC 3.0 - Kepler32 # NVIDIA Kepler generation CC 3.2 - Kepler35 # NVIDIA Kepler generation CC 3.5 - Kepler37 # NVIDIA Kepler generation CC 3.7 - Maxwell # NVIDIA Maxwell default (generation CC 5.0) - Maxwell50 # NVIDIA Maxwell generation CC 5.0 - Maxwell52 # NVIDIA Maxwell generation CC 5.2 - Maxwell53 # NVIDIA Maxwell generation CC 5.3 - Pascal60 # NVIDIA Pascal generation CC 6.0 - Pascal61 # NVIDIA Pascal generation CC 6.1 - ) - -# Setting this variable to a value other than "None" can improve GPU -# performance by turning on architecture specific code. -set(KOKKOS_GPU_ARCH "None" CACHE STRING "Optimize for specific GPU architecture.") -set_property(CACHE KOKKOS_GPU_ARCH PROPERTY STRINGS ${KOKKOS_GPU_ARCH_LIST}) - -set(KOKKOS_SEPARATE_LIBS OFF CACHE BOOL "OFF = kokkos. ON = kokkoscore, kokkoscontainers, and kokkosalgorithms.") - -# Enable hwloc library. -set(KOKKOS_ENABLE_HWLOC OFF CACHE BOOL "Enable hwloc for better process placement.") -set(KOKKOS_HWLOC_DIR "" CACHE PATH "Location of hwloc library.") - -# Enable memkind library. -set(KOKKOS_ENABLE_MEMKIND OFF CACHE BOOL "Enable memkind.") -set(KOKKOS_MEMKIND_DIR "" CACHE PATH "Location of memkind library.") - -set(KOKKOS_ENABLE_LIBRT OFF CACHE BOOL "Enable librt for more precise timer.") - -# Enable debugging. -set(KOKKOS_DEBUG OFF CACHE BOOL "Enable debugging in Kokkos.") - -# Enable profiling. -set(KOKKOS_ENABLE_PROFILING ON CACHE BOOL "Enable profiling.") - -# Enable aggressive vectorization. -set(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION OFF CACHE BOOL "Enable aggressive vectorization.") - -# Qthreads options. -set(KOKKOS_QTHREADS_DIR "" CACHE PATH "Location of Qthreads library.") - -# CUDA options. -set(KOKKOS_CUDA_DIR "" CACHE PATH "Location of CUDA library. 
Defaults to where nvcc installed.") -set(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC OFF CACHE BOOL "Enable CUDA LDG.") -set(KOKKOS_ENABLE_CUDA_UVM OFF CACHE BOOL "Enable CUDA unified virtual memory.") -set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OFF CACHE BOOL "Enable relocatable device code for CUDA.") -set(KOKKOS_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Enable lambdas for CUDA.") - -################################### FUNCTIONS ################################## - -# Sets the following compiler variables that are analogous to the CMAKE_* -# versions. We add the ability to detect NVCC (really nvcc_wrapper). -# KOKKOS_CXX_COMPILER -# KOKKOS_CXX_COMPILER_ID -# KOKKOS_CXX_COMPILER_VERSION -# -# Also verifies the compiler version meets the minimum required by Kokkos. -function(set_kokkos_cxx_compiler) - # Since CMake doesn't recognize the nvcc compiler until 3.8, we use our own - # version of the CMake variables and detect nvcc ourselves. Initially set to - # the CMake variable values. - set(INTERNAL_CXX_COMPILER ${CMAKE_CXX_COMPILER}) - set(INTERNAL_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) - set(INTERNAL_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) - - # Check if the compiler is nvcc (which really means nvcc_wrapper). - execute_process(COMMAND ${INTERNAL_CXX_COMPILER} --version - COMMAND grep nvcc - COMMAND wc -l - OUTPUT_VARIABLE INTERNAL_HAVE_COMPILER_NVCC - OUTPUT_STRIP_TRAILING_WHITESPACE) - - string(REGEX REPLACE "^ +" "" - INTERNAL_HAVE_COMPILER_NVCC ${INTERNAL_HAVE_COMPILER_NVCC}) - - if(INTERNAL_HAVE_COMPILER_NVCC) - # Set the compiler id to nvcc. We use the value used by CMake 3.8. - set(INTERNAL_CXX_COMPILER_ID NVIDIA) - - # Set nvcc's compiler version. 
- execute_process(COMMAND ${INTERNAL_CXX_COMPILER} --version - COMMAND grep release - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - - string(REGEX MATCH "[0-9]+\.[0-9]+\.[0-9]+$" - INTERNAL_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - endif() - - # Enforce the minimum compilers supported by Kokkos. - set(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:") - set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 3.5.2 or higher") - set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 4.7.2 or higher") - set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 14.0.4 or higher") - set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 7.0.28 or higher") - set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.1 or higher\n") - - if(INTERNAL_CXX_COMPILER_ID STREQUAL Clang) - if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 3.5.2) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - endif() - elseif(INTERNAL_CXX_COMPILER_ID STREQUAL GNU) - if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 4.7.2) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - endif() - elseif(INTERNAL_CXX_COMPILER_ID STREQUAL Intel) - if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 14.0.4) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - endif() - elseif(INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA) - if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 7.0.28) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - endif() - elseif(INTERNAL_CXX_COMPILER_ID STREQUAL PGI) - if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 17.1) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - endif() - endif() - - # Enforce that extensions are turned off for nvcc_wrapper. - if(INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA) - if(NOT DEFINED CMAKE_CXX_EXTENSIONS OR CMAKE_CXX_EXTENSIONS STREQUAL ON) - message(FATAL_ERROR "NVCC doesn't support C++ extensions. 
Set CMAKE_CXX_EXTENSIONS to OFF in your CMakeLists.txt.") - endif() - endif() - - if(KOKKOS_ENABLE_CUDA) - # Enforce that the compiler can compile CUDA code. - if(INTERNAL_CXX_COMPILER_ID STREQUAL Clang) - if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) - message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") - endif() - elseif(NOT INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA) - message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang.") - endif() - endif() - - set(KOKKOS_CXX_COMPILER ${INTERNAL_CXX_COMPILER} PARENT_SCOPE) - set(KOKKOS_CXX_COMPILER_ID ${INTERNAL_CXX_COMPILER_ID} PARENT_SCOPE) - set(KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION} PARENT_SCOPE) -endfunction() - -# Transitively enforces that the appropriate CXX standard compile flags (C++11 -# or above) are added to targets that use the Kokkos library. Compile features -# are used if possible. Otherwise, the appropriate flags are added to -# KOKKOS_CXX_FLAGS. Values set by the user to CMAKE_CXX_STANDARD and -# CMAKE_CXX_EXTENSIONS are honored. -function(set_kokkos_compiler_standard) - # The following table lists the versions of CMake that supports CXX_STANDARD - # and the CXX compile features for different compilers. The versions are - # based on CMake documentation, looking at CMake code, and verifying by - # testing with specific CMake versions. - # - # COMPILER CXX_STANDARD Compile Features - # --------------------------------------------------------------- - # Clang 3.1 3.1 - # GNU 3.1 3.2 - # AppleClang 3.2 3.2 - # Intel 3.6 3.6 - # Cray No No - # PGI No No - # XL No No - # - # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's - # flags for turning on C++11. Since for compiler ID and versioning purposes - # CMake recognizes the host compiler when calling nvcc_wrapper, this just - # works. 
Both NVCC and nvcc_wrapper only recognize '-std=c++11' which means - # that we can only use host compilers for CUDA builds that use those flags. - # It also means that extensions (gnu++11) can't be turned on for CUDA builds. - - # Check if we can use compile features. - if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - if(CMAKE_CXX_COMPILER_ID STREQUAL Clang) - if(NOT CMAKE_VERSION VERSION_LESS 3.1) - set(INTERNAL_USE_COMPILE_FEATURES ON) - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang OR CMAKE_CXX_COMPILER_ID STREQUAL GNU) - if(NOT CMAKE_VERSION VERSION_LESS 3.2) - set(INTERNAL_USE_COMPILE_FEATURES ON) - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) - if(NOT CMAKE_VERSION VERSION_LESS 3.6) - set(INTERNAL_USE_COMPILE_FEATURES ON) - endif() - endif() - endif() - - if(INTERNAL_USE_COMPILE_FEATURES) - # Use the compile features aspect of CMake to transitively cause C++ flags - # to populate to user code. - - # I'm using a hack by requiring features that I know force the lowest version - # of the compilers we want to support. Clang 3.3 and later support all of - # the C++11 standard. With CMake 3.8 and higher, we could switch to using - # cxx_std_11. - set(KOKKOS_CXX11_FEATURES - cxx_nonstatic_member_init # Forces GCC 4.7 or later and Intel 14.0 or later. - PARENT_SCOPE - ) - else() - # CXX compile features are not yet implemented for this combination of - # compiler and version of CMake. - - if(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang) - # Versions of CMAKE before 3.2 don't support CXX_STANDARD or C++ compile - # features for the AppleClang compiler. Set compiler flags transitively - # here such that they trickle down to a call to target_compile_options(). - - # The following two blocks of code were copied from - # /Modules/Compiler/AppleClang-CXX.cmake from CMake 3.7.2 and then - # modified. 
- if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0) - set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11") - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1) - set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++14") - set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14") - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1) - # AppleClang 5.0 knows this flag, but does not set a __cplusplus macro - # greater than 201103L. - set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y") - set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y") - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel) - # Versions of CMAKE before 3.6 don't support CXX_STANDARD or C++ compile - # features for the Intel compiler. Set compiler flags transitively here - # such that they trickle down to a call to target_compile_options(). - - # The following three blocks of code were copied from - # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. - if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - set(_std -Qstd) - set(_ext c++) - else() - set(_std -std) - set(_ext gnu++) - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2) - set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++14") - # TODO: There is no gnu++14 value supported; figure out what to do. - set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++14") - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0) - set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++1y") - # TODO: There is no gnu++14 value supported; figure out what to do. 
- set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++1y") - endif() - - if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0) - set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++11") - set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}11") - elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1) - set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++0x") - set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}0x") - endif() - elseif(CMAKE_CXX_COMPILER_ID STREQUAL Cray) - # CMAKE doesn't support CXX_STANDARD or C++ compile features for the Cray - # compiler. Set compiler options transitively here such that they trickle - # down to a call to target_compile_options(). - set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-hstd=c++11") - set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-hstd=c++11") - set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-hstd=c++11") - set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-hstd=c++11") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL PGI) - # CMAKE doesn't support CXX_STANDARD or C++ compile features for the PGI - # compiler. Set compiler options transitively here such that they trickle - # down to a call to target_compile_options(). - set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "--c++11") - set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "--c++11") - set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "--c++11") - set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "--c++11") - elseif(CMAKE_CXX_COMPILER_ID STREQUAL XL) - # CMAKE doesn't support CXX_STANDARD or C++ compile features for the XL - # compiler. Set compiler options transitively here such that they trickle - # down to a call to target_compile_options(). - set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-std=c++11") - set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11") - set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++11") - set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=c++11") - else() - # Assume GNU. 
CMAKE_CXX_STANDARD is handled correctly by CMake 3.1 and - # above for this compiler. If the user explicitly requests a C++ - # standard, CMake takes care of it. If not, transitively require C++11. - if(NOT CMAKE_CXX_STANDARD) - set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION ${CMAKE_CXX11_STANDARD_COMPILE_OPTION}) - set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION ${CMAKE_CXX11_EXTENSION_COMPILE_OPTION}) - endif() - endif() - - # Set the C++ standard info for Kokkos respecting user set values for - # CMAKE_CXX_STANDARD and CMAKE_CXX_EXTENSIONS. - if(CMAKE_CXX_STANDARD EQUAL 14) - if(DEFINED CMAKE_CXX_EXTENSIONS AND CMAKE_CXX_EXTENSIONS STREQUAL OFF) - set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX14_STANDARD_COMPILE_OPTION}) - else() - set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX14_EXTENSION_COMPILE_OPTION}) - endif() - elseif(CMAKE_CXX_STANDARD EQUAL 11) - if(DEFINED CMAKE_CXX_EXTENSIONS AND CMAKE_CXX_EXTENSIONS STREQUAL OFF) - set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_STANDARD_COMPILE_OPTION}) - else() - set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_EXTENSION_COMPILE_OPTION}) - endif() - else() - # The user didn't explicitly request a standard, transitively require - # C++11 respecting CMAKE_CXX_EXTENSIONS. - if(DEFINED CMAKE_CXX_EXTENSIONS AND CMAKE_CXX_EXTENSIONS STREQUAL OFF) - set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_STANDARD_COMPILE_OPTION}) - else() - set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_EXTENSION_COMPILE_OPTION}) - endif() - endif() - - set(KOKKOS_CXX_FLAGS ${INTERNAL_CXX_FLAGS} PARENT_SCOPE) - endif() -endfunction() - -########################## COMPILER AND FEATURE CHECKS ######################### - -# TODO: We are assuming that nvcc_wrapper is using g++ as the host compiler. -# Should we allow the user the option to change this? The host compiler -# for nvcc_wrapper can be set via the NVCC_WRAPPER_DEFAULT_COMPILER -# environment variable or by passing a different host compiler with the -# -ccbin flag. - -# TODO: Fully add CUDA support for Clang. 
-set_kokkos_cxx_compiler() - -set_kokkos_compiler_standard() - -######################### INITIALIZE INTERNAL VARIABLES ######################## - -# Add Kokkos' modules to CMake's module path. -set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") - -# Start with all global variables set to false. This guarantees correct -# results with changes and multiple configures. -set(KOKKOS_HAVE_CUDA OFF CACHE INTERNAL "") -set(KOKKOS_USE_CUDA_UVM OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_CUDA_RDC OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_CUDA_LAMBDA OFF CACHE INTERNAL "") -set(KOKKOS_CUDA_CLANG_WORKAROUND OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_OPENMP OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_PTHREAD OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_QTHREADS OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_SERIAL OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_HWLOC OFF CACHE INTERNAL "") -set(KOKKOS_ENABLE_HBWSPACE OFF CACHE INTERNAL "") -set(KOKKOS_HAVE_DEBUG OFF CACHE INTERNAL "") -set(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK OFF CACHE INTERNAL "") -set(KOKKOS_ENABLE_ISA_X86_64 OFF CACHE INTERNAL "") -set(KOKKOS_ENABLE_ISA_KNC OFF CACHE INTERNAL "") -set(KOKKOS_ENABLE_ISA_POWERPCLE OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_ARMV80 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_ARMV81 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_ARMV8_THUNDERX OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_AVX OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_AVX2 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_AVX512MIC OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_AVX512XEON OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_KNC OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_POWER8 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_POWER9 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_KEPLER OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_KEPLER30 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_KEPLER32 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_KEPLER35 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_KEPLER37 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_MAXWELL OFF CACHE INTERNAL "") 
-set(KOKKOS_ARCH_MAXWELL50 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_MAXWELL52 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_MAXWELL53 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_PASCAL OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_PASCAL60 OFF CACHE INTERNAL "") -set(KOKKOS_ARCH_PASCAL61 OFF CACHE INTERNAL "") - -############################## SET BACKEND OPTIONS ############################# - -# Make sure at least one backend is selected. -if(NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMP AND NOT KOKKOS_ENABLE_PTHREAD AND NOT KOKKOS_ENABLE_QTHREADS AND NOT KOKKOS_ENABLE_SERIAL) - message(FATAL_ERROR "Must set one of KOKKOS_ENABLE_CUDA, KOKKOS_ENABLE_OPENMP, KOKKOS_ENABLE_PTHREAD, KOKKOS_ENABLE_QTHREADS, or KOKKOS_ENABLE_SERIAL") -endif() - -# Only one of OpenMP, Pthreads, and Qthreads can be set. -set(KOKKOS_MESSAGE_TEXT "Only one of KOKKOS_ENABLE_OPENMP, KOKKOS_ENABLE_PTHREAD, and KOKKOS_ENABLE_QTHREADS can be selected") -if(KOKKOS_ENABLE_OPENMP AND KOKKOS_ENABLE_PTHREAD) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -elseif(KOKKOS_ENABLE_OPENMP AND KOKKOS_ENABLE_QTHREADS) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -elseif(KOKKOS_ENABLE_PTHREAD AND KOKKOS_ENABLE_QTHREADS) - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -endif() - -# Get source files. -file(GLOB KOKKOS_CORE_SRCS core/src/impl/*.cpp) -file(GLOB KOKKOS_CONTAINERS_SRCS containers/src/impl/*.cpp) - -# Set options if using CUDA backend. -if(KOKKOS_ENABLE_CUDA) - if(KOKKOS_CUDA_DIR) - set(CUDA_TOOLKIT_ROOT_DIR ${KOKKOS_CUDA_DIR}) - endif() - - find_package(CUDA) - - if(NOT CUDA_FOUND) - if(KOKKOS_CUDA_DIR) - message(FATAL_ERROR "Couldn't find CUDA in default locations, and KOKKOS_CUDA_DIR points to an invalid installation.") - else() - message(FATAL_ERROR "Couldn't find CUDA in default locations. 
Set KOKKOS_CUDA_DIR.") - endif() - endif() - - list(APPEND KOKKOS_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) - list(APPEND KOKKOS_LD_FLAGS -L${CUDA_TOOLKIT_ROOT_DIR}/lib64) - list(APPEND KOKKOS_LIBS cudart cuda) - - set(KOKKOS_HAVE_CUDA ON CACHE INTERNAL "") - file(GLOB KOKKOS_CUDA_SRCS core/src/Cuda/*.cpp) - list(APPEND KOKKOS_CORE_SRCS ${KOKKOS_CUDA_SRCS}) - - # Set CUDA UVM if requested. - if(KOKKOS_ENABLE_CUDA_UVM) - set(KOKKOS_USE_CUDA_UVM ON CACHE INTERNAL "") - endif() - - # Set CUDA relocatable device code if requested. - if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - set(KOKKOS_HAVE_CUDA_RDC ON CACHE INTERNAL "") - list(APPEND KOKKOS_CXX_FLAGS --relocatable-device-code=true) - list(APPEND KOKKOS_LD_FLAGS --relocatable-device-code=true) - endif() - - # Set CUDA lambda if requested. - if(KOKKOS_ENABLE_CUDA_LAMBDA) - set(KOKKOS_HAVE_CUDA_LAMBDA ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7.5) - message(FATAL_ERROR "CUDA lambda support requires CUDA 7.5 or higher. Disable it or use a 7.5 or later compiler.") - else() - list(APPEND KOKKOS_CXX_FLAGS -expt-extended-lambda) - endif() - endif() - endif() - - # Set Clang specific options. - if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - list(APPEND KOKKOS_CXX_FLAGS --cuda-path=${CUDA_TOOLKIT_ROOT_DIR}) - - set(KOKKOS_CUDA_CLANG_WORKAROUND ON CACHE INTERNAL "") - - # Force CUDA_LDG_INTRINSIC on when using Clang. - set(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC ON CACHE BOOL "Enable CUDA LDG." FORCE) - endif() -endif() - -# Set options if using OpenMP backend. 
-if(KOKKOS_ENABLE_OPENMP) - find_package(OpenMP REQUIRED) - - if(OPENMP_FOUND) - if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - list(APPEND KOKKOS_CXX_FLAGS -Xcompiler) - endif() - - list(APPEND KOKKOS_CXX_FLAGS ${OpenMP_CXX_FLAGS}) - list(APPEND KOKKOS_LD_FLAGS ${OpenMP_CXX_FLAGS}) - endif() - - set(KOKKOS_HAVE_OPENMP ON CACHE INTERNAL "") - file(GLOB KOKKOS_OPENMP_SRCS core/src/OpenMP/*.cpp) - list(APPEND KOKKOS_CORE_SRCS ${KOKKOS_OPENMP_SRCS}) -endif() - -# Set options if using Pthreads backend. -if(KOKKOS_ENABLE_PTHREAD) - find_package(Threads REQUIRED) - - list(APPEND KOKKOS_LIBS Threads::Threads) - - set(KOKKOS_HAVE_PTHREAD ON CACHE INTERNAL "") - file(GLOB KOKKOS_PTHREAD_SRCS core/src/Threads/*.cpp) - list(APPEND KOKKOS_CORE_SRCS ${KOKKOS_PTHREAD_SRCS}) -endif() - -# Set options if using Qthreads backend. -if(KOKKOS_ENABLE_QTHREADS) - if(KOKKOS_QTHREADS_DIR) - list(APPEND CMAKE_PREFIX_PATH ${KOKKOS_QTHREADS_DIR}) - endif() - - find_package(Qthreads) - - if(NOT QTHREADS_FOUND) - if(KOKKOS_QTHREADS_DIR) - message(FATAL_ERROR "Couldn't find Qthreads in default locations, and KOKKOS_QTHREADS_DIR points to an invalid installation.") - else() - message(FATAL_ERROR "Couldn't find Qthreads in default locations. Set KOKKOS_QTHREADS_DIR.") - endif() - endif() - - list(APPEND KOKKOS_INCLUDE_DIRS ${QTHREADS_INCLUDE_DIR}) - list(APPEND KOKKOS_LIBS ${QTHREADS_LIBRARIES}) - - set(KOKKOS_HAVE_QTHREADS ON CACHE INTERNAL "") - file(GLOB KOKKOS_QTHREADS_SRCS core/src/Threads/*.cpp) - list(APPEND KOKKOS_CORE_SRCS ${KOKKOS_QTHREADS_SRCS}) - - if(KOKKOS_QTHREADS_DIR) - list(REMOVE_AT CMAKE_PREFIX_PATH -1) - endif() -endif() - -# Set options if using Serial backend. -if(KOKKOS_ENABLE_SERIAL) - set(KOKKOS_HAVE_SERIAL ON CACHE INTERNAL "") -else() - # Remove serial source files. 
- list(REMOVE_ITEM KOKKOS_CORE_SRCS - "${Kokkos_SOURCE_DIR}/core/src/impl/Kokkos_Serial.cpp" - "${Kokkos_SOURCE_DIR}/core/src/impl/Kokkos_Serial_Task.cpp") -endif() - -########################### SET ARCHITECTURE OPTIONS ########################### - -# Make sure the host architecture option is valid. Need to verify in case user -# passes the option via the command line. -list(FIND KOKKOS_HOST_ARCH_LIST "${KOKKOS_HOST_ARCH}" KOKKOS_VALID_HOST_ARCH) -if(KOKKOS_VALID_HOST_ARCH EQUAL -1) - set(KOKKOS_ARCH_TEXT "\n ${KOKKOS_HOST_ARCH_LIST}") - string(REPLACE ";" "\n " KOKKOS_ARCH_TEXT "${KOKKOS_ARCH_TEXT}") - set(KOKKOS_MESSAGE_TEXT "Invalid architecture for KOKKOS_HOST_ARCH: '${KOKKOS_HOST_ARCH}'") - set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Choices:${KOKKOS_ARCH_TEXT}\n") - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -endif() - -# Make sure the GPU architecture option is valid. Need to verify in case user -# passes the option via the command line. -list(FIND KOKKOS_GPU_ARCH_LIST "${KOKKOS_GPU_ARCH}" KOKKOS_VALID_GPU_ARCH) -if(KOKKOS_VALID_GPU_ARCH EQUAL -1) - set(KOKKOS_ARCH_TEXT "\n ${KOKKOS_GPU_ARCH_LIST}") - string(REPLACE ";" "\n " KOKKOS_ARCH_TEXT "${KOKKOS_ARCH_TEXT}") - set(KOKKOS_MESSAGE_TEXT "Invalid architecture for KOKKOS_GPU_ARCH: '${KOKKOS_GPU_ARCH}'") - set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Choices:${KOKKOS_ARCH_TEXT}\n") - message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -endif() - -# Decide what ISA level we are able to support. -if(KOKKOS_HOST_ARCH STREQUAL SNB OR KOKKOS_HOST_ARCH STREQUAL HSW OR KOKKOS_HOST_ARCH STREQUAL BDW OR - KOKKOS_HOST_ARCH STREQUAL SKX OR KOKKOS_HOST_ARCH STREQUAL KNL) - set(KOKKOS_ENABLE_ISA_X86_64 ON CACHE INTERNAL "") -endif() - -if(KOKKOS_HOST_ARCH STREQUAL KNC) - set(KOKKOS_ENABLE_ISA_KNC ON CACHE INTERNAL "") -endif() - -if(KOKKOS_HOST_ARCH STREQUAL Power8 OR KOKKOS_HOST_ARCH STREQUAL Power9) - set(KOKKOS_ENABLE_ISA_POWERPCLE ON CACHE INTERNAL "") -endif() - -# Add host architecture options. 
-if(KOKKOS_HOST_ARCH STREQUAL ARMv80) - set(KOKKOS_ARCH_ARMV80 ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - else() - list(APPEND KOKKOS_CXX_FLAGS -march=armv8-a) - list(APPEND KOKKOS_LD_FLAGS -march=armv8-a) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL ARMv81) - set(KOKKOS_ARCH_ARMV81 ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - else() - list(APPEND KOKKOS_CXX_FLAGS -march=armv8.1-a) - list(APPEND KOKKOS_LD_FLAGS -march=armv8.1-a) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL ARMv8-ThunderX) - set(KOKKOS_ARCH_ARMV80 ON CACHE INTERNAL "") - set(KOKKOS_ARCH_ARMV8_THUNDERX ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - else() - list(APPEND KOKKOS_CXX_FLAGS -march=armv8-a -mtune=thunderx) - list(APPEND KOKKOS_LD_FLAGS -march=armv8-a -mtune=thunderx) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL SNB OR KOKKOS_HOST_ARCH STREQUAL AMDAVX) - set(KOKKOS_ARCH_AVX ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - list(APPEND KOKKOS_CXX_FLAGS -mavx) - list(APPEND KOKKOS_LD_FLAGS -mavx) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - list(APPEND KOKKOS_CXX_FLAGS -tp=sandybridge) - list(APPEND KOKKOS_LD_FLAGS -tp=sandybridge) - else() - list(APPEND KOKKOS_CXX_FLAGS -mavx) - list(APPEND KOKKOS_LD_FLAGS -mavx) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL HSW OR KOKKOS_HOST_ARCH STREQUAL BDW) - set(KOKKOS_ARCH_AVX2 ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - list(APPEND KOKKOS_CXX_FLAGS -xCORE-AVX2) - list(APPEND KOKKOS_LD_FLAGS -xCORE-AVX2) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - list(APPEND KOKKOS_CXX_FLAGS -tp=haswell) - list(APPEND KOKKOS_LD_FLAGS -tp=haswell) - else() - list(APPEND KOKKOS_CXX_FLAGS -march=core-avx2 -mtune=core-avx2) - 
list(APPEND KOKKOS_LD_FLAGS -march=core-avx2 -mtune=core-avx2) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL KNL) - set(KOKKOS_ARCH_AVX512MIC ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - list(APPEND KOKKOS_CXX_FLAGS -xMIC-AVX512) - list(APPEND KOKKOS_LD_FLAGS -xMIC-AVX512) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - else() - list(APPEND KOKKOS_CXX_FLAGS -march=knl) - list(APPEND KOKKOS_LD_FLAGS -march=knl) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL SKX) - set(KOKKOS_ARCH_AVX512XEON ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - list(APPEND KOKKOS_CXX_FLAGS -xCORE-AVX512) - list(APPEND KOKKOS_LD_FLAGS -xCORE-AVX512) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - else() - list(APPEND KOKKOS_CXX_FLAGS -march=skylake-avx512) - list(APPEND KOKKOS_LD_FLAGS -march=skylake-avx512) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL KNC) - set(KOKKOS_ARCH_KNC ON CACHE INTERNAL "") - list(APPEND KOKKOS_CXX_FLAGS -mmic) - list(APPEND KOKKOS_LD_FLAGS -mmic) -elseif(KOKKOS_HOST_ARCH STREQUAL Power8) - set(KOKKOS_ARCH_POWER8 ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - else() - list(APPEND KOKKOS_CXX_FLAGS -mcpu=power8 -mtune=power8) - list(APPEND KOKKOS_LD_FLAGS -mcpu=power8 -mtune=power8) - endif() -elseif(KOKKOS_HOST_ARCH STREQUAL Power9) - set(KOKKOS_ARCH_POWER9 ON CACHE INTERNAL "") - - if(KOKKOS_CXX_COMPILER_ID STREQUAL PGI) - else() - list(APPEND KOKKOS_CXX_FLAGS -mcpu=power9 -mtune=power9) - list(APPEND KOKKOS_LD_FLAGS -mcpu=power9 -mtune=power9) - endif() -endif() - -# Add GPU architecture options. 
-if(KOKKOS_ENABLE_CUDA) - if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - set(KOKKOS_GPU_ARCH_FLAG -arch) - elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - list(APPEND KOKKOS_CXX_FLAGS -x cuda) - set(KOKKOS_GPU_ARCH_FLAG --cuda-gpu-arch) - endif() - - if(KOKKOS_GPU_ARCH STREQUAL Kepler30) - set(KOKKOS_ARCH_KEPLER ON CACHE INTERNAL "") - set(KOKKOS_ARCH_KEPLER30 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_30) - elseif(KOKKOS_GPU_ARCH STREQUAL Kepler32) - set(KOKKOS_ARCH_KEPLER ON CACHE INTERNAL "") - set(KOKKOS_ARCH_KEPLER32 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_32) - elseif(KOKKOS_GPU_ARCH STREQUAL Kepler35 OR KOKKOS_GPU_ARCH STREQUAL Kepler) - set(KOKKOS_ARCH_KEPLER ON CACHE INTERNAL "") - set(KOKKOS_ARCH_KEPLER35 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_35) - elseif(KOKKOS_GPU_ARCH STREQUAL Kepler37) - set(KOKKOS_ARCH_KEPLER ON CACHE INTERNAL "") - set(KOKKOS_ARCH_KEPLER37 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_37) - elseif(KOKKOS_GPU_ARCH STREQUAL Maxwell50 OR KOKKOS_GPU_ARCH STREQUAL Maxwell) - set(KOKKOS_ARCH_MAXWELL ON CACHE INTERNAL "") - set(KOKKOS_ARCH_MAXWELL50 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_50) - elseif(KOKKOS_GPU_ARCH STREQUAL Maxwell52) - set(KOKKOS_ARCH_MAXWELL ON CACHE INTERNAL "") - set(KOKKOS_ARCH_MAXWELL52 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_52) - elseif(KOKKOS_GPU_ARCH STREQUAL Maxwell53) - set(KOKKOS_ARCH_MAXWELL ON CACHE INTERNAL "") - set(KOKKOS_ARCH_MAXWELL53 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_53) - elseif(KOKKOS_GPU_ARCH STREQUAL Pascal60) - set(KOKKOS_ARCH_PASCAL ON CACHE INTERNAL "") - set(KOKKOS_ARCH_PASCAL60 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_60) - elseif(KOKKOS_GPU_ARCH STREQUAL Pascal61) - set(KOKKOS_ARCH_PASCAL ON CACHE INTERNAL "") - 
set(KOKKOS_ARCH_PASCAL61 ON CACHE INTERNAL "") - set(KOKKOS_GPU_ARCH_FLAG ${KOKKOS_GPU_ARCH_FLAG}=sm_61) - endif() - - if(NOT KOKKOS_GPU_ARCH STREQUAL None) - list(APPEND KOKKOS_CXX_FLAGS ${KOKKOS_GPU_ARCH_FLAG}) - - if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - list(APPEND KOKKOS_LD_FLAGS ${KOKKOS_GPU_ARCH_FLAG}) - endif() - endif() -endif() - -############################### SET OTHER OPTIONS ############################## - -# Set options if using hwloc. -if(KOKKOS_ENABLE_HWLOC) - if(KOKKOS_HWLOC_DIR) - list(APPEND CMAKE_PREFIX_PATH ${KOKKOS_HWLOC_DIR}) - endif() - - find_package(HWLOC) - - if(NOT HWLOC_FOUND) - if(KOKKOS_HWLOC_DIR) - message(FATAL_ERROR "Couldn't find HWLOC in default locations, and KOKKOS_HWLOC_DIR points to an invalid installation.") - else() - message(FATAL_ERROR "Couldn't find HWLOC in default locations. Set KOKKOS_HWLOC_DIR.") - endif() - endif() - - list(APPEND KOKKOS_INCLUDE_DIRS ${HWLOC_INCLUDE_DIR}) - list(APPEND KOKKOS_LIBS ${HWLOC_LIBRARIES}) - - set(KOKKOS_HAVE_HWLOC ON CACHE INTERNAL "") - - if(KOKKOS_HWLOC_DIR) - list(REMOVE_AT CMAKE_PREFIX_PATH -1) - endif() -endif() - -# Set options if using memkind. -if(KOKKOS_ENABLE_MEMKIND) - if(KOKKOS_MEMKIND_DIR) - list(APPEND CMAKE_PREFIX_PATH ${KOKKOS_MEMKIND_DIR}) - endif() - - find_package(Memkind) - - if(NOT MEMKIND_FOUND) - if(KOKKOS_MEMKIND_DIR) - message(FATAL_ERROR "Couldn't find Memkind in default locations, and KOKKOS_MEMKIND_DIR points to an invalid installation.") - else() - message(FATAL_ERROR "Couldn't find Memkind in default locations. Set KOKKOS_MEMKIND_DIR.") - endif() - endif() - - set(KOKKOS_ENABLE_HBWSPACE ON CACHE INTERNAL "") - list(APPEND KOKKOS_INCLUDE_DIRS ${MEMKIND_INCLUDE_DIR}) - list(APPEND KOKKOS_LIBS ${MEMKIND_LIBRARIES}) - - if(KOKKOS_MEMKIND_DIR) - list(REMOVE_AT CMAKE_PREFIX_PATH -1) - endif() -else() - # Remove HBW source file. 
-  list(REMOVE_ITEM KOKKOS_CORE_SRCS
-       "${Kokkos_SOURCE_DIR}/core/src/impl/Kokkos_HBWSpace.cpp")
-endif()
-
-# Set options if using librt.
-if(KOKKOS_ENABLE_LIBRT)
-  list(APPEND KOKKOS_LIBS rt)
-endif()
-
-# Set debugging if requested.
-if(KOKKOS_DEBUG)
-  set(KOKKOS_HAVE_DEBUG ON CACHE INTERNAL "")
-  set(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ON CACHE INTERNAL "")
-
-  if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
-    list(APPEND KOKKOS_CXX_FLAGS -lineinfo)
-  endif()
-
-  list(APPEND KOKKOS_CXX_FLAGS -g)
-  list(APPEND KOKKOS_LD_FLAGS -g)
-endif()
-
-# Set profiling if requested.
-if(KOKKOS_ENABLE_PROFILING)
-  list(APPEND KOKKOS_LIBS dl)
-else()
-  # Remove profiling source file.
-  list(REMOVE_ITEM KOKKOS_CORE_SRCS
-       "${Kokkos_SOURCE_DIR}/core/src/impl/Kokkos_Profiling_Interface.cpp")
-endif()
-
-# Use GCC toolchain with Clang.
-if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT APPLE)
-  find_program(KOKKOS_GCC_PATH g++)
-  if(NOT KOKKOS_GCC_PATH)
-    message(FATAL_ERROR "Can't find GCC path to get toolchain for Clang.")
-  endif()
-  string(REPLACE "/bin/g++" "" KOKKOS_GCC_PATH ${KOKKOS_GCC_PATH})
-
-  list(APPEND KOKKOS_CXX_FLAGS --gcc-toolchain=${KOKKOS_GCC_PATH})
-  list(APPEND KOKKOS_LD_FLAGS --gcc-toolchain=${KOKKOS_GCC_PATH})
-endif()
-
-############################ Detect if submodule ###############################
-#
-# With thanks to StackOverflow:
-#      http://stackoverflow.com/questions/25199677/how-to-detect-if-current-scope-has-a-parent-in-cmake
-#
-get_directory_property(HAS_PARENT PARENT_DIRECTORY)
-if(HAS_PARENT)
-  message(STATUS "Submodule build")
-  SET(KOKKOS_HEADER_DIR "include/kokkos")
-else()
-  message(STATUS "Standalone build")
-  SET(KOKKOS_HEADER_DIR "include")
-endif()
-
-############################ PRINT CONFIGURE STATUS ############################
-
-message(STATUS "")
-message(STATUS "****************** Kokkos Settings ******************")
-message(STATUS "Execution Spaces")
-
-if(KOKKOS_ENABLE_CUDA)
-  message(STATUS " Device Parallel: Cuda")
-else()
-  message(STATUS " Device
Parallel: None") -endif() - -if(KOKKOS_ENABLE_OPENMP) - message(STATUS " Host Parallel: OpenMP") -elseif(KOKKOS_ENABLE_PTHREAD) - message(STATUS " Host Parallel: Pthread") -elseif(KOKKOS_ENABLE_QTHREADS) - message(STATUS " Host Parallel: Qthreads") -else() - message(STATUS " Host Parallel: None") -endif() - -if(KOKKOS_ENABLE_SERIAL) - message(STATUS " Host Serial: Serial") -else() - message(STATUS " Host Serial: None") -endif() - -message(STATUS "") -message(STATUS "Architectures") -message(STATUS " Host Architecture: ${KOKKOS_HOST_ARCH}") -message(STATUS " Device Architecture: ${KOKKOS_GPU_ARCH}") - -message(STATUS "") -message(STATUS "Enabled options") - -if(KOKKOS_SEPARATE_LIBS) - message(STATUS " KOKKOS_SEPARATE_LIBS") -endif() - -if(KOKKOS_ENABLE_HWLOC) - message(STATUS " KOKKOS_ENABLE_HWLOC") -endif() - -if(KOKKOS_ENABLE_MEMKIND) - message(STATUS " KOKKOS_ENABLE_MEMKIND") -endif() - -if(KOKKOS_DEBUG) - message(STATUS " KOKKOS_DEBUG") -endif() - -if(KOKKOS_ENABLE_PROFILING) - message(STATUS " KOKKOS_ENABLE_PROFILING") -endif() - -if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) - message(STATUS " KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION") -endif() - -if(KOKKOS_ENABLE_CUDA) - if(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) - message(STATUS " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC") - endif() - - if(KOKKOS_ENABLE_CUDA_UVM) - message(STATUS " KOKKOS_ENABLE_CUDA_UVM") - endif() - - if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - message(STATUS " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE") - endif() - - if(KOKKOS_ENABLE_CUDA_LAMBDA) - message(STATUS " KOKKOS_ENABLE_CUDA_LAMBDA") - endif() - - if(KOKKOS_CUDA_DIR) - message(STATUS " KOKKOS_CUDA_DIR: ${KOKKOS_CUDA_DIR}") - endif() -endif() - -if(KOKKOS_QTHREADS_DIR) - message(STATUS " KOKKOS_QTHREADS_DIR: ${KOKKOS_QTHREADS_DIR}") -endif() - -if(KOKKOS_HWLOC_DIR) - message(STATUS " KOKKOS_HWLOC_DIR: ${KOKKOS_HWLOC_DIR}") -endif() - -if(KOKKOS_MEMKIND_DIR) - message(STATUS " KOKKOS_MEMKIND_DIR: ${KOKKOS_MEMKIND_DIR}") -endif() - 
-message(STATUS "*****************************************************") -message(STATUS "") - -################################ SET UP PROJECT ################################ - -configure_file( - ${Kokkos_SOURCE_DIR}/core/cmake/KokkosCore_config.h.in - ${Kokkos_BINARY_DIR}/KokkosCore_config.h -) - -SET(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries") -SET(INSTALL_BIN_DIR bin CACHE PATH "Installation directory for executables") -SET(INSTALL_INCLUDE_DIR ${KOKKOS_HEADER_DIR} CACHE PATH - "Installation directory for header files") -IF(WIN32 AND NOT CYGWIN) - SET(DEF_INSTALL_CMAKE_DIR CMake) -ELSE() - SET(DEF_INSTALL_CMAKE_DIR lib/CMake/Kokkos) -ENDIF() - -SET(INSTALL_CMAKE_DIR ${DEF_INSTALL_CMAKE_DIR} CACHE PATH - "Installation directory for CMake files") - -# Make relative paths absolute (needed later on) -FOREACH(p LIB BIN INCLUDE CMAKE) - SET(var INSTALL_${p}_DIR) - IF(NOT IS_ABSOLUTE "${${var}}") - SET(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}") - ENDIF() -ENDFOREACH() - -# set up include-directories -SET (Kokkos_INCLUDE_DIRS - ${Kokkos_SOURCE_DIR}/core/src - ${Kokkos_SOURCE_DIR}/containers/src - ${Kokkos_SOURCE_DIR}/algorithms/src - ${Kokkos_BINARY_DIR} # to find KokkosCore_config.h - ${KOKKOS_INCLUDE_DIRS} -) - -# pass include dirs back to parent scope -SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE) - -INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS}) - -IF(KOKKOS_SEPARATE_LIBS) - # kokkoscore - ADD_LIBRARY( - kokkoscore - ${KOKKOS_CORE_SRCS} - ) - - target_compile_options( - kokkoscore - PUBLIC ${KOKKOS_CXX_FLAGS} - ) - - target_compile_features( - kokkoscore - PUBLIC ${KOKKOS_CXX11_FEATURES} - ) - - # Install the kokkoscore library - INSTALL (TARGETS kokkoscore - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin - ) - - # Install the kokkoscore headers - INSTALL (DIRECTORY - ${Kokkos_SOURCE_DIR}/core/src/ - DESTINATION 
${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" - ) - - # Install KokkosCore_config.h header - INSTALL (FILES - ${Kokkos_BINARY_DIR}/KokkosCore_config.h - DESTINATION ${KOKKOS_HEADER_DIR} - ) - - TARGET_LINK_LIBRARIES( - kokkoscore - ${KOKKOS_LD_FLAGS} - ${KOKKOS_LIBS} - ) - - # kokkoscontainers - ADD_LIBRARY( - kokkoscontainers - ${KOKKOS_CONTAINERS_SRCS} - ) - - TARGET_LINK_LIBRARIES( - kokkoscontainers - kokkoscore - ) - - # Install the kokkocontainers library - INSTALL (TARGETS kokkoscontainers - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) - - # Install the kokkoscontainers headers - INSTALL (DIRECTORY - ${Kokkos_SOURCE_DIR}/containers/src/ - DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" - ) - - # kokkosalgorithms - Build as interface library since no source files. - ADD_LIBRARY( - kokkosalgorithms - INTERFACE - ) - - target_include_directories( - kokkosalgorithms - INTERFACE ${Kokkos_SOURCE_DIR}/algorithms/src - ) - - TARGET_LINK_LIBRARIES( - kokkosalgorithms - INTERFACE kokkoscore - ) - - # Install the kokkoalgorithms library - INSTALL (TARGETS kokkosalgorithms - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) - - # Install the kokkosalgorithms headers - INSTALL (DIRECTORY - ${Kokkos_SOURCE_DIR}/algorithms/src/ - DESTINATION ${KOKKOS_INSTALL_INDLUDE_DIR} - FILES_MATCHING PATTERN "*.hpp" - ) - - SET (Kokkos_LIBRARIES_NAMES kokkoscore kokkoscontainers kokkosalgorithms) - -ELSE() - # kokkos - ADD_LIBRARY( - kokkos - ${KOKKOS_CORE_SRCS} - ${KOKKOS_CONTAINERS_SRCS} - ) - - target_compile_options( - kokkos - PUBLIC ${KOKKOS_CXX_FLAGS} - ) - - target_compile_features( - kokkos - PUBLIC ${KOKKOS_CXX11_FEATURES} - ) - - TARGET_LINK_LIBRARIES( - kokkos - ${KOKKOS_LD_FLAGS} - ${KOKKOS_LIBS} - ) - - # Install the kokkos library - INSTALL 
(TARGETS kokkos - EXPORT KokkosTargets - ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib - RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin) - - - # Install the kokkos headers - INSTALL (DIRECTORY - EXPORT KokkosTargets - ${Kokkos_SOURCE_DIR}/core/src/ - DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" - ) - INSTALL (DIRECTORY - EXPORT KokkosTargets - ${Kokkos_SOURCE_DIR}/containers/src/ - DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" - ) - INSTALL (DIRECTORY - EXPORT KokkosTargets - ${Kokkos_SOURCE_DIR}/algorithms/src/ - DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" - ) - - INSTALL (FILES - ${Kokkos_BINARY_DIR}/KokkosCore_config.h - DESTINATION ${KOKKOS_HEADER_DIR} - ) - - include_directories(${Kokkos_BINARY_DIR}) - include_directories(${Kokkos_SOURCE_DIR}/core/src) - include_directories(${Kokkos_SOURCE_DIR}/containers/src) - include_directories(${Kokkos_SOURCE_DIR}/algorithms/src) - - - SET (Kokkos_LIBRARIES_NAMES kokkos) - -endif() - -# Add all targets to the build-tree export set -export(TARGETS ${Kokkos_LIBRARIES_NAMES} - FILE "${Kokkos_BINARY_DIR}/KokkosTargets.cmake") - -# Export the package for use from the build-tree -# (this registers the build-tree with a global CMake-registry) -export(PACKAGE Kokkos) - -# Create the KokkosConfig.cmake and KokkosConfigVersion files -file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_CMAKE_DIR}" - "${INSTALL_INCLUDE_DIR}") -# ... for the build tree -set(CONF_INCLUDE_DIRS "${Kokkos_SOURCE_DIR}" "${Kokkos_BINARY_DIR}") -configure_file(${Kokkos_SOURCE_DIR}/cmake/KokkosConfig.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" @ONLY) -# ... 
for the install tree
-set(CONF_INCLUDE_DIRS "\${Kokkos_CMAKE_DIR}/${REL_INCLUDE_DIR}")
-configure_file(${Kokkos_SOURCE_DIR}/cmake/KokkosConfig.cmake.in
-  "${Kokkos_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/KokkosConfig.cmake" @ONLY)
-
-# Install the KokkosConfig.cmake and KokkosConfigVersion.cmake
-install(FILES
-  "${Kokkos_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/KokkosConfig.cmake"
-  DESTINATION "${INSTALL_CMAKE_DIR}")
-
-# Install the export set for use with the install-tree
-INSTALL(EXPORT KokkosTargets DESTINATION
-  "${INSTALL_CMAKE_DIR}")
diff --git a/lib/kokkos/cmake/kokkos_build.cmake b/lib/kokkos/cmake/kokkos_build.cmake
new file mode 100644
index 0000000000..f31680d6e2
--- /dev/null
+++ b/lib/kokkos/cmake/kokkos_build.cmake
@@ -0,0 +1,219 @@
+# kokkos_generated_settings.cmake includes the kokkos library itself in KOKKOS_LIBS
+# which we do not want to use for the cmake builds so clean this up (the input is quoted so an empty KOKKOS_LIBS is still one argument)
+string(REGEX REPLACE "-lkokkos" "" KOKKOS_LIBS "${KOKKOS_LIBS}")
+
+############################ Detect if submodule ###############################
+#
+# With thanks to StackOverflow:
+#      http://stackoverflow.com/questions/25199677/how-to-detect-if-current-scope-has-a-parent-in-cmake
+#
+get_directory_property(HAS_PARENT PARENT_DIRECTORY)
+if(HAS_PARENT)
+  message(STATUS "Submodule build")
+  SET(KOKKOS_HEADER_DIR "include/kokkos")
+else()
+  message(STATUS "Standalone build")
+  SET(KOKKOS_HEADER_DIR "include")
+endif()
+
+################################ Handle the actual build #######################
+
+SET(INSTALL_LIB_DIR lib CACHE PATH "Installation directory for libraries")
+SET(INSTALL_BIN_DIR bin CACHE PATH "Installation directory for executables")
+SET(INSTALL_INCLUDE_DIR ${KOKKOS_HEADER_DIR} CACHE PATH
+  "Installation directory for header files")
+IF(WIN32 AND NOT CYGWIN)
+  SET(DEF_INSTALL_CMAKE_DIR CMake)
+ELSE()
+  SET(DEF_INSTALL_CMAKE_DIR lib/CMake/Kokkos)
+ENDIF()
+
+SET(INSTALL_CMAKE_DIR ${DEF_INSTALL_CMAKE_DIR} CACHE PATH
+    "Installation directory for CMake files")
+
+# Make relative paths absolute (needed later on)
+FOREACH(p LIB BIN INCLUDE CMAKE)
+  SET(var INSTALL_${p}_DIR)
+  IF(NOT IS_ABSOLUTE "${${var}}")
+    SET(${var} "${CMAKE_INSTALL_PREFIX}/${${var}}")
+  ENDIF()
+ENDFOREACH()
+
+# set up include-directories
+SET (Kokkos_INCLUDE_DIRS
+    ${Kokkos_SOURCE_DIR}/core/src
+    ${Kokkos_SOURCE_DIR}/containers/src
+    ${Kokkos_SOURCE_DIR}/algorithms/src
+    ${Kokkos_BINARY_DIR}  # to find KokkosCore_config.h
+    ${KOKKOS_INCLUDE_DIRS}
+)
+
+# pass include dirs back to parent scope (PARENT_SCOPE is only used when a parent directory exists)
+if(HAS_PARENT)
+SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS} PARENT_SCOPE)
+else()
+SET(Kokkos_INCLUDE_DIRS_RET ${Kokkos_INCLUDE_DIRS})
+endif()
+
+INCLUDE_DIRECTORIES(${Kokkos_INCLUDE_DIRS})
+
+IF(KOKKOS_SEPARATE_LIBS)
+  # Sources come from makefile-generated kokkos_generated_settings.cmake file
+  # Separate libs need to separate the sources
+  set_kokkos_srcs(KOKKOS_SRC ${KOKKOS_SRC})
+
+  # kokkoscore
+  ADD_LIBRARY(
+    kokkoscore
+    ${KOKKOS_CORE_SRCS}
+  )
+
+  target_compile_options(
+    kokkoscore
+    PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_CXX_FLAGS}>
+  )
+
+  # Install the kokkoscore library
+  INSTALL (TARGETS kokkoscore
+           EXPORT KokkosTargets
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin
+  )
+
+  TARGET_LINK_LIBRARIES(
+    kokkoscore
+    ${KOKKOS_LD_FLAGS}
+    ${KOKKOS_EXTRA_LIBS_LIST}
+  )
+
+  # kokkoscontainers (NOTE: only ADD_LIBRARY is guarded; the link/install calls below assume the target exists)
+  if (DEFINED KOKKOS_CONTAINERS_SRCS)
+    ADD_LIBRARY(
+      kokkoscontainers
+      ${KOKKOS_CONTAINERS_SRCS}
+    )
+  endif()
+
+  TARGET_LINK_LIBRARIES(
+    kokkoscontainers
+    kokkoscore
+  )
+
+  # Install the kokkoscontainers library
+  INSTALL (TARGETS kokkoscontainers
+           EXPORT KokkosTargets
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
+
+  # kokkosalgorithms - Build as interface library since no source files.
+  ADD_LIBRARY(
+    kokkosalgorithms
+    INTERFACE
+  )
+
+  target_include_directories(
+    kokkosalgorithms
+    INTERFACE ${Kokkos_SOURCE_DIR}/algorithms/src
+  )
+
+  TARGET_LINK_LIBRARIES(
+    kokkosalgorithms
+    INTERFACE kokkoscore
+  )
+
+  # Install the kokkosalgorithms library
+  INSTALL (TARGETS kokkosalgorithms
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
+
+  SET (Kokkos_LIBRARIES_NAMES kokkoscore kokkoscontainers kokkosalgorithms)
+
+ELSE()
+  # kokkos
+  ADD_LIBRARY(
+    kokkos
+    ${KOKKOS_CORE_SRCS}
+    ${KOKKOS_CONTAINERS_SRCS}
+  )
+
+  target_compile_options(
+    kokkos
+    PUBLIC $<$<COMPILE_LANGUAGE:CXX>:${KOKKOS_CXX_FLAGS}>
+  )
+
+  TARGET_LINK_LIBRARIES(
+    kokkos
+    ${KOKKOS_LD_FLAGS}
+    ${KOKKOS_EXTRA_LIBS_LIST}
+  )
+
+  # Install the kokkos library
+  INSTALL (TARGETS kokkos
+           EXPORT KokkosTargets
+           ARCHIVE DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           LIBRARY DESTINATION ${CMAKE_INSTALL_PREFIX}/lib
+           RUNTIME DESTINATION ${CMAKE_INSTALL_PREFIX}/bin)
+
+
+  SET (Kokkos_LIBRARIES_NAMES kokkos)
+
+endif()  # KOKKOS_SEPARATE_LIBS
+
+# Install the kokkos headers
+INSTALL (DIRECTORY
+         EXPORT KokkosTargets
+         ${Kokkos_SOURCE_DIR}/core/src/
+         DESTINATION ${KOKKOS_HEADER_DIR}
+         FILES_MATCHING PATTERN "*.hpp"
+)
+INSTALL (DIRECTORY
+         EXPORT KokkosTargets
+         ${Kokkos_SOURCE_DIR}/containers/src/
+         DESTINATION ${KOKKOS_HEADER_DIR}
+         FILES_MATCHING PATTERN "*.hpp"
+)
+INSTALL (DIRECTORY
+         EXPORT KokkosTargets
+         ${Kokkos_SOURCE_DIR}/algorithms/src/
+         DESTINATION ${KOKKOS_HEADER_DIR}
+         FILES_MATCHING PATTERN "*.hpp"
+)
+
+INSTALL (FILES
+         ${Kokkos_BINARY_DIR}/KokkosCore_config.h
+         DESTINATION ${KOKKOS_HEADER_DIR}
+)
+
+# Add all targets to the build-tree export set
+export(TARGETS ${Kokkos_LIBRARIES_NAMES}
+  FILE "${Kokkos_BINARY_DIR}/KokkosTargets.cmake")
+
+# Export the package for use from the build-tree
+# (this registers the build-tree with a global CMake-registry)
+export(PACKAGE Kokkos)
+
+# Create the KokkosConfig.cmake and KokkosConfigVersion files
+file(RELATIVE_PATH REL_INCLUDE_DIR "${INSTALL_CMAKE_DIR}"
+   "${INSTALL_INCLUDE_DIR}")
+# ... for the build tree
+set(CONF_INCLUDE_DIRS "${Kokkos_SOURCE_DIR}" "${Kokkos_BINARY_DIR}")
+configure_file(${Kokkos_SOURCE_DIR}/cmake/KokkosConfig.cmake.in
+  "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" @ONLY)
+# ... for the install tree
+set(CONF_INCLUDE_DIRS "\${Kokkos_CMAKE_DIR}/${REL_INCLUDE_DIR}")
+configure_file(${Kokkos_SOURCE_DIR}/cmake/KokkosConfig.cmake.in
+  "${Kokkos_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/KokkosConfig.cmake" @ONLY)
+
+# Install the KokkosConfig.cmake and KokkosConfigVersion.cmake
+install(FILES
+  "${Kokkos_BINARY_DIR}${CMAKE_FILES_DIRECTORY}/KokkosConfig.cmake"
+  DESTINATION "${INSTALL_CMAKE_DIR}")
+
+#This seems not to do anything?
+#message(STATUS "KokkosTargets: " ${KokkosTargets})
+# Install the export set for use with the install-tree
+INSTALL(EXPORT KokkosTargets DESTINATION
+       "${INSTALL_CMAKE_DIR}")
diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake
new file mode 100644
index 0000000000..c0c62ccb6a
--- /dev/null
+++ b/lib/kokkos/cmake/kokkos_functions.cmake
@@ -0,0 +1,345 @@
+################################### FUNCTIONS ##################################
+# List of functions
+#   set_kokkos_cxx_compiler
+#   set_kokkos_cxx_standard
+#   set_kokkos_srcs
+
+#-------------------------------------------------------------------------------
+# function(set_kokkos_cxx_compiler)
+# Sets the following compiler variables that are analogous to the CMAKE_*
+# versions. We add the ability to detect NVCC (really nvcc_wrapper).
+#   KOKKOS_CXX_COMPILER
+#   KOKKOS_CXX_COMPILER_ID
+#   KOKKOS_CXX_COMPILER_VERSION
+#
+# Inputs:
+#   KOKKOS_ENABLE_CUDA
+#   CMAKE_CXX_COMPILER
+#   CMAKE_CXX_COMPILER_ID
+#   CMAKE_CXX_COMPILER_VERSION
+#
+# Also verifies the compiler version meets the minimum required by Kokkos.
+function(set_kokkos_cxx_compiler)
+  # Since CMake doesn't recognize the nvcc compiler until 3.8, we use our own
+  # version of the CMake variables and detect nvcc ourselves. Initially set to
+  # the CMake variable values.
+  set(INTERNAL_CXX_COMPILER ${CMAKE_CXX_COMPILER})
+  set(INTERNAL_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID})
+  set(INTERNAL_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION})
+
+  # Check if the compiler is nvcc (which really means nvcc_wrapper): count the '--version' lines mentioning nvcc.
+  execute_process(COMMAND ${INTERNAL_CXX_COMPILER} --version
+                  COMMAND grep nvcc
+                  COMMAND wc -l
+                  OUTPUT_VARIABLE INTERNAL_HAVE_COMPILER_NVCC
+                  OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+  string(REGEX REPLACE "^ +" ""
+         INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}")
+
+  if(INTERNAL_HAVE_COMPILER_NVCC)
+    # Set the compiler id to nvcc. We use the value used by CMake 3.8.
+    set(INTERNAL_CXX_COMPILER_ID NVIDIA)
+
+    # Set nvcc's compiler version (dots are written as \\. so the regex engine sees a literal '.').
+    execute_process(COMMAND ${INTERNAL_CXX_COMPILER} --version
+                    COMMAND grep release
+                    OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION
+                    OUTPUT_STRIP_TRAILING_WHITESPACE)
+
+    string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+$"
+           INTERNAL_CXX_COMPILER_VERSION "${INTERNAL_CXX_COMPILER_VERSION}")
+  endif()
+
+  # Enforce the minimum compilers supported by Kokkos.
+  set(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos. Required compiler versions:")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang 3.5.2 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC 4.8.4 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel 15.0.2 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC 7.0.28 or higher")
+  set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n PGI 17.1 or higher\n")
+
+  if(INTERNAL_CXX_COMPILER_ID STREQUAL Clang)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 3.5.2)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL GNU)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 4.8.4)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL Intel)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 15.0.2)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 7.0.28)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  elseif(INTERNAL_CXX_COMPILER_ID STREQUAL PGI)
+    if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 17.1)
+      message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}")
+    endif()
+  endif()
+
+  # Enforce that extensions are turned off for nvcc_wrapper (any true value of CMAKE_CXX_EXTENSIONS counts, not only ON).
+  if(INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
+    if(CMAKE_CXX_EXTENSIONS)
+      message(FATAL_ERROR "NVCC doesn't support C++ extensions. Set CMAKE_CXX_EXTENSIONS to OFF in your CMakeLists.txt.")
+    endif()
+  endif()
+
+  if(KOKKOS_ENABLE_CUDA)
+    # Enforce that the compiler can compile CUDA code.
+    if(INTERNAL_CXX_COMPILER_ID STREQUAL Clang)
+      if(INTERNAL_CXX_COMPILER_VERSION VERSION_LESS 4.0.0)
+        message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.")
+      endif()
+    elseif(NOT INTERNAL_CXX_COMPILER_ID STREQUAL NVIDIA)
+      message(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang.")
+    endif()
+  endif()
+
+  set(KOKKOS_CXX_COMPILER ${INTERNAL_CXX_COMPILER} PARENT_SCOPE)
+  set(KOKKOS_CXX_COMPILER_ID ${INTERNAL_CXX_COMPILER_ID} PARENT_SCOPE)
+  set(KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION} PARENT_SCOPE)
+endfunction()
+
+#-------------------------------------------------------------------------------
+# function(set_kokkos_cxx_standard)
+# Transitively enforces that the appropriate CXX standard compile flags (C++11
+# or above) are added to targets that use the Kokkos library. Compile features
+# are used if possible. Otherwise, the appropriate flags are added to
+# KOKKOS_CXX_FLAGS. Values set by the user to CMAKE_CXX_STANDARD and
+# CMAKE_CXX_EXTENSIONS are honored.
+#
+# Outputs:
+#   KOKKOS_CXX11_FEATURES
+#   KOKKOS_CXX_FLAGS
+#
+# Inputs:
+#   KOKKOS_CXX_COMPILER
+#   KOKKOS_CXX_COMPILER_ID
+#   KOKKOS_CXX_COMPILER_VERSION
+#
+function(set_kokkos_cxx_standard)
+  # The following table lists the versions of CMake that supports CXX_STANDARD
+  # and the CXX compile features for different compilers. The versions are
+  # based on CMake documentation, looking at CMake code, and verifying by
+  # testing with specific CMake versions.
+  #
+  #   COMPILER     CXX_STANDARD   Compile Features
+  #   ---------------------------------------------------------------
+  #   Clang        3.1            3.1
+  #   GNU          3.1            3.2
+  #   AppleClang   3.2            3.2
+  #   Intel        3.6            3.6
+  #   Cray         No             No
+  #   PGI          No             No
+  #   XL           No             No
+  #
+  # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's
+  # flags for turning on C++11. Since for compiler ID and versioning purposes
+  # CMake recognizes the host compiler when calling nvcc_wrapper, this just
+  # works. Both NVCC and nvcc_wrapper only recognize '-std=c++11' which means
+  # that we can only use host compilers for CUDA builds that use those flags.
+  # It also means that extensions (gnu++11) can't be turned on for CUDA builds.
+
+  # Check if we can use compile features.
+  if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA)
+    if(CMAKE_CXX_COMPILER_ID STREQUAL Clang)
+      if(NOT CMAKE_VERSION VERSION_LESS 3.1)
+        set(INTERNAL_USE_COMPILE_FEATURES ON)
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang OR CMAKE_CXX_COMPILER_ID STREQUAL GNU)
+      if(NOT CMAKE_VERSION VERSION_LESS 3.2)
+        set(INTERNAL_USE_COMPILE_FEATURES ON)
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
+      if(NOT CMAKE_VERSION VERSION_LESS 3.6)
+        set(INTERNAL_USE_COMPILE_FEATURES ON)
+      endif()
+    endif()
+  endif()
+
+  if(INTERNAL_USE_COMPILE_FEATURES)
+    # Use the compile features aspect of CMake to transitively cause C++ flags
+    # to populate to user code.
+
+    # I'm using a hack by requiring features that I know force the lowest version
+    # of the compilers we want to support. Clang 3.3 and later support all of
+    # the C++11 standard. With CMake 3.8 and higher, we could switch to using
+    # cxx_std_11.
+    set(KOKKOS_CXX11_FEATURES
+        cxx_nonstatic_member_init # Forces GCC 4.7 or later and Intel 14.0 or later.
+        PARENT_SCOPE
+       )
+  else()
+    # CXX compile features are not yet implemented for this combination of
+    # compiler and version of CMake.
+
+    if(CMAKE_CXX_COMPILER_ID STREQUAL AppleClang)
+      # Versions of CMAKE before 3.2 don't support CXX_STANDARD or C++ compile
+      # features for the AppleClang compiler. Set compiler flags transitively
+      # here such that they trickle down to a call to target_compile_options().
+
+      # The following two blocks of code were copied from
+      # /Modules/Compiler/AppleClang-CXX.cmake from CMake 3.7.2 and then
+      # modified.
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 4.0)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-std=c++11")
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-std=gnu++11")
+      endif()
+
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 6.1)
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++14")
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++14")
+      elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 5.1)
+        # AppleClang 5.0 knows this flag, but does not set a __cplusplus macro
+        # greater than 201103L.
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++1y")
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=gnu++1y")
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL Intel)
+      # Versions of CMAKE before 3.6 don't support CXX_STANDARD or C++ compile
+      # features for the Intel compiler. Set compiler flags transitively here
+      # such that they trickle down to a call to target_compile_options().
+
+      # The following three blocks of code were copied from
+      # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified.
+      if("x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC")
+        set(_std -Qstd)
+        set(_ext c++)
+      else()
+        set(_std -std)
+        set(_ext gnu++)
+      endif()
+
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.2)
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++14")
+        # TODO: There is no gnu++14 value supported; figure out what to do.
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++14")
+      elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 15.0.0)
+        set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "${_std}=c++1y")
+        # TODO: There is no gnu++14 value supported; figure out what to do.
+        set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "${_std}=c++1y")
+      endif()
+
+      if(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 13.0)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++11")
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}11")
+      elseif(NOT CMAKE_CXX_COMPILER_VERSION VERSION_LESS 12.1)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "${_std}=c++0x")
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "${_std}=${_ext}0x")
+      endif()
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL Cray)
+      # CMAKE doesn't support CXX_STANDARD or C++ compile features for the Cray
+      # compiler. Set compiler options transitively here such that they trickle
+      # down to a call to target_compile_options().
+      set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-hstd=c++11")
+      set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-hstd=c++11")
+      set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-hstd=c++11")
+      set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-hstd=c++11")
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL PGI)
+      # CMAKE doesn't support CXX_STANDARD or C++ compile features for the PGI
+      # compiler. Set compiler options transitively here such that they trickle
+      # down to a call to target_compile_options().
+      set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "--c++11")
+      set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "--c++11")
+      set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "--c++11")
+      set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "--c++11")
+    elseif(CMAKE_CXX_COMPILER_ID STREQUAL XL)
+      # CMAKE doesn't support CXX_STANDARD or C++ compile features for the XL
+      # compiler. Set compiler options transitively here such that they trickle
+      # down to a call to target_compile_options().
+      set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION "-std=c++11")
+      set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION "-std=c++11")
+      set(INTERNAL_CXX14_STANDARD_COMPILE_OPTION "-std=c++11")
+      set(INTERNAL_CXX14_EXTENSION_COMPILE_OPTION "-std=c++11")
+    else()
+      # Assume GNU. CMAKE_CXX_STANDARD is handled correctly by CMake 3.1 and
+      # above for this compiler. If the user explicitly requests a C++
+      # standard, CMake takes care of it. If not, transitively require C++11.
+      if(NOT CMAKE_CXX_STANDARD)
+        set(INTERNAL_CXX11_STANDARD_COMPILE_OPTION ${CMAKE_CXX11_STANDARD_COMPILE_OPTION})
+        set(INTERNAL_CXX11_EXTENSION_COMPILE_OPTION ${CMAKE_CXX11_EXTENSION_COMPILE_OPTION})
+      endif()
+    endif()
+
+    # Set the C++ standard info for Kokkos respecting user set values for
+    # CMAKE_CXX_STANDARD and CMAKE_CXX_EXTENSIONS.
+    # Only use cxx extension if explicitly requested (any true value: ON, TRUE, YES, 1, ...)
+    if(CMAKE_CXX_STANDARD EQUAL 14)
+      if(CMAKE_CXX_EXTENSIONS)
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX14_EXTENSION_COMPILE_OPTION})
+      else()
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX14_STANDARD_COMPILE_OPTION})
+      endif()
+    elseif(CMAKE_CXX_STANDARD EQUAL 11)
+      if(CMAKE_CXX_EXTENSIONS)
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_EXTENSION_COMPILE_OPTION})
+      else()
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_STANDARD_COMPILE_OPTION})
+      endif()
+    else()
+      # The user didn't explicitly request a standard, transitively require
+      # C++11 respecting CMAKE_CXX_EXTENSIONS.
+      if(CMAKE_CXX_EXTENSIONS)
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_EXTENSION_COMPILE_OPTION})
+      else()
+        set(INTERNAL_CXX_FLAGS ${INTERNAL_CXX11_STANDARD_COMPILE_OPTION})
+      endif()
+    endif()
+
+    set(KOKKOS_CXX_FLAGS ${INTERNAL_CXX_FLAGS} PARENT_SCOPE)
+  endif()
+endfunction()
+
+
+#-------------------------------------------------------------------------------
+# function(set_kokkos_srcs)
+# Takes a list of sources for kokkos (e.g., KOKKOS_SRC from Makefile.kokkos and
+# put it into kokkos_generated_settings.cmake) and sorts the files into the subpackages or
+# separate_libraries. This is core and containers (algorithms is pure header
+# files).
+#
+# Inputs:
+#   KOKKOS_SRC
+#
+# Outputs:
+#   KOKKOS_CORE_SRCS
+#   KOKKOS_CONTAINERS_SRCS
+#
+function(set_kokkos_srcs)
+  set(opts ) # no-value args
+  set(oneValArgs )
+  set(multValArgs KOKKOS_SRC) # e.g., lists
+  cmake_parse_arguments(IN "${opts}" "${oneValArgs}" "${multValArgs}" ${ARGN})
+
+  foreach(sfile ${IN_KOKKOS_SRC})
+    string(REPLACE "${CMAKE_CURRENT_SOURCE_DIR}/" "" stripfile "${sfile}")
+    string(REPLACE "/" ";" striplist "${stripfile}")
+    list(GET striplist 0 firstdir)
+    if("${firstdir}" STREQUAL "core") # quoted so an empty first path component cannot break the if()
+      list(APPEND KOKKOS_CORE_SRCS ${sfile})
+    else()
+      list(APPEND KOKKOS_CONTAINERS_SRCS ${sfile})
+    endif()
+  endforeach()
+  set(KOKKOS_CORE_SRCS ${KOKKOS_CORE_SRCS} PARENT_SCOPE)
+  set(KOKKOS_CONTAINERS_SRCS ${KOKKOS_CONTAINERS_SRCS} PARENT_SCOPE)
+  return()
+endfunction()
+
+# Fill KOKKOS_INTERNAL_ENABLE_<VARIABLE>_DEFAULT (if still empty) from KOKKOS_ENABLE_<VARIABLE>, else from DEFAULT; then drop KOKKOS_ENABLE_<VARIABLE> from the cache
+macro(set_kokkos_default_default VARIABLE DEFAULT)
+  IF( "${KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT}" STREQUAL "" )
+    IF( "${KOKKOS_ENABLE_${VARIABLE}}" STREQUAL "" )
+      set(KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT ${DEFAULT})
+      # MESSAGE(WARNING "Set: KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT to ${KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT}")
+    ELSE()
+      set(KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT ${KOKKOS_ENABLE_${VARIABLE}})
+      # MESSAGE(WARNING "Set: KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT to ${KOKKOS_INTERNAL_ENABLE_${VARIABLE}_DEFAULT}")
+    ENDIF()
+  ENDIF()
+  UNSET(KOKKOS_ENABLE_${VARIABLE} CACHE)
+endmacro()
diff --git a/lib/kokkos/cmake/kokkos_options.cmake b/lib/kokkos/cmake/kokkos_options.cmake
new file mode 100644
index 0000000000..f17710a4ce
--- /dev/null
+++ b/lib/kokkos/cmake/kokkos_options.cmake
@@ -0,0 +1,365 @@
+########################## NOTES ###############################################
+# List the options for configuring kokkos using CMake method of doing it.
+# These options then get mapped onto KOKKOS_SETTINGS environment variable by
+# kokkos_settings.cmake.
It is separate to allow other packages to override
+# these variables (e.g., TriBITS).
+
+########################## AVAILABLE OPTIONS ###################################
+# Use lists for documentation, verification, and programming convenience
+
+# All CMake options of the type KOKKOS_ENABLE_* (CamelCase spelling here; upper-cased where they are used)
+set(KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST)
+list(APPEND KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST
+  Serial
+  OpenMP
+  Pthread
+  Qthread
+  Cuda
+  ROCm
+  HWLOC
+  MEMKIND
+  LIBRT
+  Cuda_Lambda
+  Cuda_Relocatable_Device_Code
+  Cuda_UVM
+  Cuda_LDG_Intrinsic
+  Debug
+  Debug_DualView_Modify_Check
+  Debug_Bounds_Check
+  Compiler_Warnings
+  Profiling
+  Profiling_Load_Print
+  Aggressive_Vectorization
+  )
+
+#-------------------------------------------------------------------------------
+#------------------------------- Recognize CamelCase Options ---------------------------
+#-------------------------------------------------------------------------------
+# Reconcile every Kokkos_ENABLE_<Opt> (CamelCase) with its upper-case twin KOKKOS_ENABLE_<OPT>.
+foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
+  string(TOUPPER ${opt} OPT )
+  IF(DEFINED Kokkos_ENABLE_${opt})
+    IF(DEFINED KOKKOS_ENABLE_${OPT})
+      IF(NOT ("${KOKKOS_ENABLE_${OPT}}" STREQUAL "${Kokkos_ENABLE_${opt}}"))
+        IF(DEFINED KOKKOS_ENABLE_${OPT}_INTERNAL) # cached by the loop at the end of this file on an earlier run
+          MESSAGE(WARNING "Defined both Kokkos_ENABLE_${opt}=[${Kokkos_ENABLE_${opt}}] and KOKKOS_ENABLE_${OPT}=[${KOKKOS_ENABLE_${OPT}}] and they differ! Could be caused by old CMakeCache Variable. Run CMake again and warning should disappear. If not you are truly setting both variables.")
+          IF(NOT ("${Kokkos_ENABLE_${opt}}" STREQUAL "${KOKKOS_ENABLE_${OPT}_INTERNAL}")) # CamelCase differs from the cached copy: it wins
+            UNSET(KOKKOS_ENABLE_${OPT} CACHE)
+            SET(KOKKOS_ENABLE_${OPT} ${Kokkos_ENABLE_${opt}})
+            MESSAGE(WARNING "SET BOTH VARIABLES KOKKOS_ENABLE_${OPT}: ${KOKKOS_ENABLE_${OPT}}")
+          ELSE() # CamelCase matches the cached copy: adopt the upper-case value
+            SET(Kokkos_ENABLE_${opt} ${KOKKOS_ENABLE_${OPT}})
+          ENDIF()
+        ELSE()
+          MESSAGE(FATAL_ERROR "Defined both Kokkos_ENABLE_${opt}=[${Kokkos_ENABLE_${opt}}] and KOKKOS_ENABLE_${OPT}=[${KOKKOS_ENABLE_${OPT}}] and they differ!")
+        ENDIF()
+      ENDIF()
+    ELSE() # only the CamelCase spelling was given: use it as the default for the upper-case option
+      SET(KOKKOS_INTERNAL_ENABLE_${OPT}_DEFAULT ${Kokkos_ENABLE_${opt}})
+    ENDIF()
+  ENDIF()
+endforeach()
+
+IF(DEFINED Kokkos_Arch)
+  IF(DEFINED KOKKOS_ARCH)
+    IF(NOT ("${KOKKOS_ARCH}" STREQUAL "${Kokkos_Arch}")) # quoted: KOKKOS_ARCH may be empty or a ;-separated list
+      MESSAGE(FATAL_ERROR "Defined both Kokkos_Arch and KOKKOS_ARCH and they differ!")
+    ENDIF()
+  ELSE()
+    SET(KOKKOS_ARCH ${Kokkos_Arch})
+  ENDIF()
+ENDIF()
+
+#-------------------------------------------------------------------------------
+# List of possible architectures (host CPUs and GPUs) for KOKKOS_ARCH.
+#-------------------------------------------------------------------------------
+set(KOKKOS_ARCH_LIST)
+list(APPEND KOKKOS_ARCH_LIST
+  None # No architecture optimization
+  AMDAVX # (HOST) AMD chip
+  ARMv80 # (HOST) ARMv8.0 Compatible CPU
+  ARMv81 # (HOST) ARMv8.1 Compatible CPU
+  ARMv8-ThunderX # (HOST) ARMv8 Cavium ThunderX CPU
+  WSM # (HOST) Intel Westmere CPU
+  SNB # (HOST) Intel Sandy/Ivy Bridge CPUs
+  HSW # (HOST) Intel Haswell CPUs
+  BDW # (HOST) Intel Broadwell Xeon E-class CPUs
+  SKX # (HOST) Intel Skylake Xeon E-class HPC CPUs (AVX512)
+  KNC # (HOST) Intel Knights Corner Xeon Phi
+  KNL # (HOST) Intel Knights Landing Xeon Phi
+  BGQ # (HOST) IBM Blue Gene Q
+  Power7 # (HOST) IBM POWER7 CPUs
+  Power8 # (HOST) IBM POWER8 CPUs
+  Power9 # (HOST) IBM POWER9 CPUs
+  Kepler # (GPU) NVIDIA Kepler default (generation CC 3.5)
+  Kepler30 # (GPU) NVIDIA Kepler generation CC 3.0
+  Kepler32 # (GPU) NVIDIA Kepler generation CC 3.2
+  Kepler35 # (GPU) NVIDIA Kepler generation CC 3.5
+  Kepler37 # (GPU) NVIDIA Kepler generation CC 3.7
+  Maxwell # (GPU) NVIDIA Maxwell default (generation CC 5.0)
+  Maxwell50 # (GPU) NVIDIA Maxwell generation CC 5.0
+  Maxwell52 # (GPU) NVIDIA Maxwell generation CC 5.2
+  Maxwell53 # (GPU) NVIDIA Maxwell generation CC 5.3
+  Pascal60 # (GPU) NVIDIA Pascal generation CC 6.0
+  Pascal61 # (GPU) NVIDIA Pascal generation CC 6.1
+  )
+
+# List of possible devices (backends).
+# The case and spelling here needs to match Makefile.kokkos
+set(KOKKOS_DEVICES_LIST)
+# Options: Cuda,ROCm,OpenMP,Pthread,Qthreads,Serial
+list(APPEND KOKKOS_DEVICES_LIST
+  Cuda # NVIDIA GPU -- see below
+  OpenMP # OpenMP
+  Pthread # pthread
+  Qthreads # qthreads
+  Serial # serial
+  ROCm # ROCm (AMD GPU)
+  )
+
+# List of possible TPLs for Kokkos
+# From Makefile.kokkos: Options: hwloc,librt,experimental_memkind
+set(KOKKOS_USE_TPLS_LIST)
+list(APPEND KOKKOS_USE_TPLS_LIST
+  HWLOC # hwloc
+  LIBRT # librt
+  MEMKIND # experimental_memkind
+  )
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_HWLOC hwloc)
+set(KOKKOS_INTERNAL_LIBRT librt)
+set(KOKKOS_INTERNAL_MEMKIND experimental_memkind)
+
+# List of possible Advanced options
+set(KOKKOS_OPTIONS_LIST)
+list(APPEND KOKKOS_OPTIONS_LIST
+  AGGRESSIVE_VECTORIZATION
+  DISABLE_PROFILING
+  DISABLE_DUALVIEW_MODIFY_CHECK
+  ENABLE_PROFILE_LOAD_PRINT
+  )
+# Map of cmake variables to Makefile variables (CUDA entries; set again, identically, in the CUDA section below)
+set(KOKKOS_INTERNAL_LDG_INTRINSIC use_ldg)
+set(KOKKOS_INTERNAL_UVM force_uvm)
+set(KOKKOS_INTERNAL_RELOCATABLE_DEVICE_CODE rdc)
+
+
+#-------------------------------------------------------------------------------
+# List of possible Options for CUDA
+#-------------------------------------------------------------------------------
+# From Makefile.kokkos: Options: use_ldg,force_uvm,rdc,enable_lambda
+set(KOKKOS_CUDA_OPTIONS_LIST)
+list(APPEND KOKKOS_CUDA_OPTIONS_LIST
+  LDG_INTRINSIC # use_ldg
+  UVM # force_uvm
+  RELOCATABLE_DEVICE_CODE # rdc
+  LAMBDA # enable_lambda
+  )
+
+# Map of cmake variables to Makefile variables
+set(KOKKOS_INTERNAL_LDG_INTRINSIC use_ldg)
+set(KOKKOS_INTERNAL_UVM force_uvm)
+set(KOKKOS_INTERNAL_RELOCATABLE_DEVICE_CODE rdc)
+set(KOKKOS_INTERNAL_LAMBDA enable_lambda)
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- Create doc strings ----------------------------
+#-------------------------------------------------------------------------------
+# tmpr is the separator that replaces ";" when a list is rendered into a doc string.
+set(tmpr "\n ")
+string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_ARCH_DOCSTR "${KOKKOS_ARCH_LIST}")
+# This would be useful, but we use Foo_ENABLE mechanisms
+#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_DEVICES_DOCSTR "${KOKKOS_DEVICES_LIST}")
+#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_USE_TPLS_DOCSTR "${KOKKOS_USE_TPLS_LIST}")
+#string(REPLACE ";" ${tmpr} KOKKOS_INTERNAL_CUDA_OPTIONS_DOCSTR "${KOKKOS_CUDA_OPTIONS_LIST}")
+
+#-------------------------------------------------------------------------------
+#------------------------------- GENERAL OPTIONS -------------------------------
+#-------------------------------------------------------------------------------
+
+# Setting this variable to a value other than "None" can improve host
+# performance by turning on architecture specific code.
+# "NOT_SET" is used to determine if the option is passed in. It is reset to
+# default "None" down below.
+set(KOKKOS_ARCH "NOT_SET" CACHE STRING
+  "Optimize for specific host architecture. Options are: ${KOKKOS_INTERNAL_ARCH_DOCSTR}")
+
+# Whether to build separate libraries or not
+set(KOKKOS_SEPARATE_LIBS OFF CACHE BOOL "OFF = kokkos. ON = kokkoscore, kokkoscontainers, and kokkosalgorithms.")
+
+# Qthreads options.
+set(KOKKOS_QTHREADS_DIR "" CACHE PATH "Location of Qthreads library.")
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_DEVICES --------------------------------
+#-------------------------------------------------------------------------------
+# Figure out default settings: a Trilinos build follows its TPL/OpenMP switches, a standalone build enables only Serial.
+IF(Trilinos_ENABLE_Kokkos)
+  set_kokkos_default_default(SERIAL ON)
+  set_kokkos_default_default(PTHREAD OFF)
+  IF(TPL_ENABLE_QTHREAD)
+    set_kokkos_default_default(QTHREADS ${TPL_ENABLE_QTHREAD})
+  ELSE()
+    set_kokkos_default_default(QTHREADS OFF)
+  ENDIF()
+  IF(Trilinos_ENABLE_OpenMP)
+    set_kokkos_default_default(OPENMP ${Trilinos_ENABLE_OpenMP})
+  ELSE()
+    set_kokkos_default_default(OPENMP OFF)
+  ENDIF()
+  IF(TPL_ENABLE_CUDA)
+    set_kokkos_default_default(CUDA ${TPL_ENABLE_CUDA})
+  ELSE()
+    set_kokkos_default_default(CUDA OFF)
+  ENDIF()
+  set_kokkos_default_default(ROCM OFF)
+ELSE()
+  set_kokkos_default_default(SERIAL ON)
+  set_kokkos_default_default(OPENMP OFF)
+  set_kokkos_default_default(PTHREAD OFF)
+  set_kokkos_default_default(QTHREADS OFF) # must be QTHREADS: KOKKOS_ENABLE_QTHREADS below reads KOKKOS_INTERNAL_ENABLE_QTHREADS_DEFAULT
+  set_kokkos_default_default(CUDA OFF)
+  set_kokkos_default_default(ROCM OFF)
+ENDIF()
+
+# Set which Kokkos backend to use.
+# These are the actual options that define the settings; each takes the default computed above.
+set(KOKKOS_ENABLE_SERIAL ${KOKKOS_INTERNAL_ENABLE_SERIAL_DEFAULT} CACHE BOOL "Whether to enable the Kokkos::Serial device. This device executes \"parallel\" kernels sequentially on a single CPU thread. It is enabled by default. If you disable this device, please enable at least one other CPU device, such as Kokkos::OpenMP or Kokkos::Threads.")
+set(KOKKOS_ENABLE_OPENMP ${KOKKOS_INTERNAL_ENABLE_OPENMP_DEFAULT} CACHE BOOL "Enable OpenMP support in Kokkos." FORCE)
+set(KOKKOS_ENABLE_PTHREAD ${KOKKOS_INTERNAL_ENABLE_PTHREAD_DEFAULT} CACHE BOOL "Enable Pthread support in Kokkos.")
+set(KOKKOS_ENABLE_QTHREADS ${KOKKOS_INTERNAL_ENABLE_QTHREADS_DEFAULT} CACHE BOOL "Enable Qthreads support in Kokkos.")
+set(KOKKOS_ENABLE_CUDA ${KOKKOS_INTERNAL_ENABLE_CUDA_DEFAULT} CACHE BOOL "Enable CUDA support in Kokkos.")
+set(KOKKOS_ENABLE_ROCM ${KOKKOS_INTERNAL_ENABLE_ROCM_DEFAULT} CACHE BOOL "Enable ROCm support in Kokkos.")
+
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS DEBUG and PROFILING --------------------
+#-------------------------------------------------------------------------------
+
+# Debug related options enable compiler warnings
+
+set_kokkos_default_default(DEBUG OFF)
+set(KOKKOS_ENABLE_DEBUG ${KOKKOS_INTERNAL_ENABLE_DEBUG_DEFAULT} CACHE BOOL "Enable Kokkos Debug.")
+
+# From Makefile.kokkos: Advanced Options:
+#compiler_warnings, aggressive_vectorization, disable_profiling, disable_dualview_modify_check, enable_profile_load_print
+set_kokkos_default_default(COMPILER_WARNINGS OFF)
+set(KOKKOS_ENABLE_COMPILER_WARNINGS ${KOKKOS_INTERNAL_ENABLE_COMPILER_WARNINGS_DEFAULT} CACHE BOOL "Enable compiler warnings.")
+
+set_kokkos_default_default(DEBUG_DUALVIEW_MODIFY_CHECK OFF)
+set(KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK ${KOKKOS_INTERNAL_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK_DEFAULT} CACHE BOOL "Enable dualview modify check.")
+
+# Enable aggressive vectorization.
+set_kokkos_default_default(AGGRESSIVE_VECTORIZATION OFF)
+set(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION ${KOKKOS_INTERNAL_ENABLE_AGGRESSIVE_VECTORIZATION_DEFAULT} CACHE BOOL "Enable aggressive vectorization.")
+
+# Enable profiling.
+set_kokkos_default_default(PROFILING ON)
+set(KOKKOS_ENABLE_PROFILING ${KOKKOS_INTERNAL_ENABLE_PROFILING_DEFAULT} CACHE BOOL "Enable profiling.")
+
+set_kokkos_default_default(PROFILING_LOAD_PRINT OFF)
+set(KOKKOS_ENABLE_PROFILING_LOAD_PRINT ${KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT_DEFAULT} CACHE BOOL "Enable profile load print.")
+
+
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_USE_TPLS -------------------------------
+#-------------------------------------------------------------------------------
+# Enable hwloc library.
+# Figure out default: ON only for a Trilinos build that has TPL_ENABLE_HWLOC set, otherwise OFF.
+IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HWLOC)
+  set_kokkos_default_default(HWLOC ON)
+ELSE()
+  set_kokkos_default_default(HWLOC OFF)
+ENDIF()
+set(KOKKOS_ENABLE_HWLOC ${KOKKOS_INTERNAL_ENABLE_HWLOC_DEFAULT} CACHE BOOL "Enable hwloc for better process placement.")
+set(KOKKOS_HWLOC_DIR "" CACHE PATH "Location of hwloc library. (kokkos tpl)")
+
+# Enable memkind library (off by default).
+set_kokkos_default_default(MEMKIND OFF)
+set(KOKKOS_ENABLE_MEMKIND ${KOKKOS_INTERNAL_ENABLE_MEMKIND_DEFAULT} CACHE BOOL "Enable memkind. (kokkos tpl)")
+set(KOKKOS_MEMKIND_DIR "" CACHE PATH "Location of memkind library. (kokkos tpl)")
+
+# Enable rt library. Standalone builds default to ON; Trilinos builds follow TPL_ENABLE_LIBRT when it is defined, else OFF.
+IF(Trilinos_ENABLE_Kokkos)
+  IF(DEFINED TPL_ENABLE_LIBRT)
+    set_kokkos_default_default(LIBRT ${TPL_ENABLE_LIBRT})
+  ELSE()
+    set_kokkos_default_default(LIBRT OFF)
+  ENDIF()
+ELSE()
+  set_kokkos_default_default(LIBRT ON)
+ENDIF()
+set(KOKKOS_ENABLE_LIBRT ${KOKKOS_INTERNAL_ENABLE_LIBRT_DEFAULT} CACHE BOOL "Enable librt for more precise timer. (kokkos tpl)")
+
+
+#-------------------------------------------------------------------------------
+#------------------------------- KOKKOS_CUDA_OPTIONS ---------------------------
+#-------------------------------------------------------------------------------
+
+# CUDA options.
+# Set Defaults (pass the option name only: the macro itself builds KOKKOS_INTERNAL_ENABLE_<name>_DEFAULT)
+set_kokkos_default_default(CUDA_LDG_INTRINSIC OFF)
+set_kokkos_default_default(CUDA_UVM OFF)
+set_kokkos_default_default(CUDA_RELOCATABLE_DEVICE_CODE OFF)
+IF(Trilinos_ENABLE_Kokkos)
+  IF(KOKKOS_ENABLE_CUDA)
+    find_package(CUDA)
+  ENDIF()
+  IF (DEFINED CUDA_VERSION) # NOTE(review): no lambda default is recorded in a Trilinos build when CUDA_VERSION is undefined
+    IF (CUDA_VERSION VERSION_GREATER "7.0")
+      set_kokkos_default_default(CUDA_LAMBDA ON)
+    ELSE()
+      set_kokkos_default_default(CUDA_LAMBDA OFF)
+    ENDIF()
+  ENDIF()
+ELSE()
+  set_kokkos_default_default(CUDA_LAMBDA OFF)
+ENDIF()
+
+# Set actual options from the defaults recorded above
+set(KOKKOS_CUDA_DIR "" CACHE PATH "Location of CUDA library. Defaults to where nvcc installed.")
+set(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC ${KOKKOS_INTERNAL_ENABLE_CUDA_LDG_INTRINSIC_DEFAULT} CACHE BOOL "Enable CUDA LDG. (cuda option)")
+set(KOKKOS_ENABLE_CUDA_UVM ${KOKKOS_INTERNAL_ENABLE_CUDA_UVM_DEFAULT} CACHE BOOL "Enable CUDA unified virtual memory.")
+set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ${KOKKOS_INTERNAL_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE_DEFAULT} CACHE BOOL "Enable relocatable device code for CUDA. (cuda option)")
+set(KOKKOS_ENABLE_CUDA_LAMBDA ${KOKKOS_INTERNAL_ENABLE_CUDA_LAMBDA_DEFAULT} CACHE BOOL "Enable lambdas for CUDA. (cuda option)")
+
+
+#-------------------------------------------------------------------------------
+#----------------------- HOST ARCH AND LEGACY TRIBITS --------------------------
+#-------------------------------------------------------------------------------
+
+# This defines the previous legacy TriBITS builds.
+set(KOKKOS_LEGACY_TRIBITS False)
+IF ("${KOKKOS_ARCH}" STREQUAL "NOT_SET")
+  set(KOKKOS_ARCH "None")
+  IF(KOKKOS_HAS_TRILINOS)
+    set(KOKKOS_LEGACY_TRIBITS True)
+  ENDIF()
+ENDIF()
+IF (KOKKOS_HAS_TRILINOS)
+  IF (KOKKOS_LEGACY_TRIBITS)
+    message(STATUS "Using the legacy tribits build because KOKKOS_ARCH not set")
+  ELSE()
+    message(STATUS "NOT using the legacy tribits build because KOKKOS_ARCH *is* set")
+  ENDIF()
+ENDIF()
+
+#-------------------------------------------------------------------------------
+#----------------------- Set CamelCase Options if they are not yet set ---------
+#-------------------------------------------------------------------------------
+# For every option: cache its value as KOKKOS_ENABLE_<OPT>_INTERNAL and mirror it to the CamelCase Kokkos_ENABLE_<Opt>.
+foreach(opt ${KOKKOS_INTERNAL_ENABLE_OPTIONS_LIST})
+  string(TOUPPER ${opt} OPT )
+  UNSET(KOKKOS_ENABLE_${OPT}_INTERNAL CACHE)
+  SET(KOKKOS_ENABLE_${OPT}_INTERNAL ${KOKKOS_ENABLE_${OPT}} CACHE BOOL INTERNAL) # "INTERNAL" is the docstring here; the cache type is BOOL
+  IF(DEFINED KOKKOS_ENABLE_${OPT})
+    UNSET(Kokkos_ENABLE_${opt} CACHE)
+    SET(Kokkos_ENABLE_${opt} ${KOKKOS_ENABLE_${OPT}} CACHE BOOL "CamelCase Compatibility setting for KOKKOS_ENABLE_${OPT}")
+  ENDIF()
+endforeach()
+
diff --git a/lib/kokkos/cmake/kokkos_settings.cmake b/lib/kokkos/cmake/kokkos_settings.cmake
new file mode 100644
index 0000000000..850a74a670
--- /dev/null
+++ b/lib/kokkos/cmake/kokkos_settings.cmake
@@ -0,0 +1,257 @@
+########################## NOTES ###############################################
+# This file's goal is to take CMake options found in kokkos_options.cmake but
+# possibly set from elsewhere
+# (see: trilinos/cmake/ProjectCompilerPostConfig.cmake)
+# using CMake idioms and map them onto the KOKKOS_SETTINGS variables that get
+# passed to the kokkos makefile configuration:
+# make -f ${CMAKE_SOURCE_DIR}/core/src/Makefile ${KOKKOS_SETTINGS} build-makefile-cmake-kokkos
+# that generates KokkosCore_config.h and kokkos_generated_settings.cmake
+# To understand how to form KOKKOS_SETTINGS, see
+# /Makefile.kokkos
+
+#-------------------------------------------------------------------------------
+#------------------------------- GENERAL OPTIONS -------------------------------
+#-------------------------------------------------------------------------------
+
+# Ensure that KOKKOS_ARCH is in the ARCH_LIST (KOKKOS_ARCH may name several architectures)
+foreach(arch ${KOKKOS_ARCH})
+  list(FIND KOKKOS_ARCH_LIST ${arch} indx)
+  if (indx EQUAL -1)
+    message(FATAL_ERROR "${arch} is not an accepted value for KOKKOS_ARCH."
+      " Please pick from these choices: ${KOKKOS_INTERNAL_ARCH_DOCSTR}")
+  endif ()
+endforeach()
+
+# KOKKOS_SETTINGS uses KOKKOS_ARCH as a comma-delimited string
+string(REPLACE ";" "," KOKKOS_ARCH "${KOKKOS_ARCH}")
+set(KOKKOS_ARCH ${KOKKOS_ARCH})
+
+# From Makefile.kokkos: Options: yes,no
+if(${KOKKOS_ENABLE_DEBUG})
+  set(KOKKOS_DEBUG yes)
+else()
+  set(KOKKOS_DEBUG no)
+endif()
+
+#------------------------------- KOKKOS_DEVICES --------------------------------
+# Can have multiple devices: collect every enabled entry of KOKKOS_DEVICES_LIST
+set(KOKKOS_DEVICESl)
+foreach(devopt ${KOKKOS_DEVICES_LIST})
+  string(TOUPPER ${devopt} devoptuc)
+  if (${KOKKOS_ENABLE_${devoptuc}})
+    list(APPEND KOKKOS_DEVICESl ${devopt})
+  endif ()
+endforeach()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_DEVICES "${KOKKOS_DEVICESl}")
+
+#------------------------------- KOKKOS_OPTIONS --------------------------------
+# From Makefile.kokkos: Options: aggressive_vectorization,disable_profiling
+#compiler_warnings, aggressive_vectorization, disable_profiling, disable_dualview_modify_check, enable_profile_load_print
+
+set(KOKKOS_OPTIONSl)
+if(${KOKKOS_ENABLE_COMPILER_WARNINGS})
+  list(APPEND KOKKOS_OPTIONSl compiler_warnings)
+endif()
+if(${KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION})
+  list(APPEND KOKKOS_OPTIONSl aggressive_vectorization)
+endif()
+if(NOT ${KOKKOS_ENABLE_PROFILING})
+  list(APPEND KOKKOS_OPTIONSl disable_profiling) # the Makefile.kokkos option that switches profiling off
+endif()
+if(NOT ${KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK})
+  list(APPEND KOKKOS_OPTIONSl disable_dualview_modify_check)
+endif()
+if(${KOKKOS_ENABLE_PROFILING_LOAD_PRINT})
+  list(APPEND KOKKOS_OPTIONSl enable_profile_load_print)
+endif()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_OPTIONS "${KOKKOS_OPTIONSl}")
+
+
+#------------------------------- KOKKOS_USE_TPLS -------------------------------
+# Construct the Makefile options: map each enabled TPL to its Makefile name (KOKKOS_INTERNAL_<TPL>)
+set(KOKKOS_USE_TPLSl)
+foreach(tplopt ${KOKKOS_USE_TPLS_LIST})
+  if (${KOKKOS_ENABLE_${tplopt}})
+    list(APPEND KOKKOS_USE_TPLSl ${KOKKOS_INTERNAL_${tplopt}})
+  endif ()
+endforeach()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_USE_TPLS "${KOKKOS_USE_TPLSl}")
+
+
+#------------------------------- KOKKOS_CUDA_OPTIONS ---------------------------
+# Construct the Makefile options; reset the accumulator that the loop appends to
+set(KOKKOS_CUDA_OPTIONSl)
+foreach(cudaopt ${KOKKOS_CUDA_OPTIONS_LIST})
+  if (${KOKKOS_ENABLE_CUDA_${cudaopt}})
+    list(APPEND KOKKOS_CUDA_OPTIONSl ${KOKKOS_INTERNAL_${cudaopt}})
+  endif ()
+endforeach()
+# List needs to be comma-delimited
+string(REPLACE ";" "," KOKKOS_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONSl}")
+
+#------------------------------- PATH VARIABLES --------------------------------
+# Want makefile to use same executables specified which means modifying
+# the path so the $(shell ...) commands in the makefile see the right exec
+# Also, the Makefile's use FOO_PATH naming scheme for -I/-L construction
+#TODO: Makefile.kokkos allows this to be overwritten?
ROCM_HCC_PATH
+# NOTE(review): KOKKOS_INTERNAL_PATHS is built as one space-separated string, not a list -- confirm the consumer of KOKKOS_SETTINGS splits it.
+set(KOKKOS_INTERNAL_PATHS)
+set(addpathl)
+foreach(kvar "CUDA;QTHREADS;${KOKKOS_USE_TPLS_LIST}")
+  if(${KOKKOS_ENABLE_${kvar}})
+    if(DEFINED KOKKOS_${kvar}_DIR)
+      set(KOKKOS_INTERNAL_PATHS "${KOKKOS_INTERNAL_PATHS} ${kvar}_PATH=${KOKKOS_${kvar}_DIR}")
+      if(IS_DIRECTORY ${KOKKOS_${kvar}_DIR}/bin)
+        list(APPEND addpathl ${KOKKOS_${kvar}_DIR}/bin)
+      endif()
+    endif()
+  endif()
+endforeach()
+# Path env is : delimited
+string(REPLACE ";" ":" KOKKOS_INTERNAL_ADDTOPATH "${addpathl}")
+
+
+######################### SET KOKKOS_SETTINGS ##################################
+# Set the KOKKOS_SETTINGS String -- this is the primary communication with the
+# makefile configuration. See Makefile.kokkos
+
+set(KOKKOS_SETTINGS KOKKOS_SRC_PATH=${KOKKOS_SRC_PATH})
+set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_PATH=${KOKKOS_PATH})
+set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} KOKKOS_INSTALL_PATH=${CMAKE_INSTALL_PREFIX})
+
+# Form of KOKKOS_foo=$KOKKOS_foo (entries that are undefined or empty are left out)
+foreach(kvar ARCH;DEVICES;DEBUG;OPTIONS;CUDA_OPTIONS;USE_TPLS)
+  set(KOKKOS_VAR KOKKOS_${kvar})
+  if(DEFINED KOKKOS_${kvar})
+    if (NOT "${${KOKKOS_VAR}}" STREQUAL "")
+      set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ${KOKKOS_VAR}=${${KOKKOS_VAR}})
+    endif()
+  endif()
+endforeach()
+
+# Form of VAR=VAL (entries that are undefined or empty are left out)
+#TODO: Makefile supports MPICH_CXX, OMPI_CXX as well
+foreach(ovar CXX;CXXFLAGS;LDFLAGS)
+  if(DEFINED ${ovar})
+    if (NOT "${${ovar}}" STREQUAL "")
+      set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ${ovar}=${${ovar}})
+    endif()
+  endif()
+endforeach()
+
+# Finally, do the paths
+if (NOT "${KOKKOS_INTERNAL_PATHS}" STREQUAL "")
+  set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ${KOKKOS_INTERNAL_PATHS})
+endif()
+if (NOT "${KOKKOS_INTERNAL_ADDTOPATH}" STREQUAL "")
+  set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} PATH=${KOKKOS_INTERNAL_ADDTOPATH}:\${PATH})
+endif()
+
+# Final form that gets passed to make: the settings are prefixed with "env"
+set(KOKKOS_SETTINGS env ${KOKKOS_SETTINGS})
+
+
+############################ PRINT CONFIGURE STATUS ############################
+
+if(KOKKOS_CMAKE_VERBOSE)
+ message(STATUS "") + message(STATUS "****************** Kokkos Settings ******************") + message(STATUS "Execution Spaces") + + if(KOKKOS_ENABLE_CUDA) + message(STATUS " Device Parallel: Cuda") + else() + message(STATUS " Device Parallel: None") + endif() + + if(KOKKOS_ENABLE_OPENMP) + message(STATUS " Host Parallel: OpenMP") + elseif(KOKKOS_ENABLE_PTHREAD) + message(STATUS " Host Parallel: Pthread") + elseif(KOKKOS_ENABLE_QTHREADS) + message(STATUS " Host Parallel: Qthreads") + else() + message(STATUS " Host Parallel: None") + endif() + + if(KOKKOS_ENABLE_SERIAL) + message(STATUS " Host Serial: Serial") + else() + message(STATUS " Host Serial: None") + endif() + + message(STATUS "") + message(STATUS "Architectures:") + message(STATUS " ${KOKKOS_ARCH}") + + message(STATUS "") + message(STATUS "Enabled options") + + if(KOKKOS_SEPARATE_LIBS) + message(STATUS " KOKKOS_SEPARATE_LIBS") + endif() + + if(KOKKOS_ENABLE_HWLOC) + message(STATUS " KOKKOS_ENABLE_HWLOC") + endif() + + if(KOKKOS_ENABLE_MEMKIND) + message(STATUS " KOKKOS_ENABLE_MEMKIND") + endif() + + if(KOKKOS_ENABLE_DEBUG) + message(STATUS " KOKKOS_ENABLE_DEBUG") + endif() + + if(KOKKOS_ENABLE_PROFILING) + message(STATUS " KOKKOS_ENABLE_PROFILING") + endif() + + if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + message(STATUS " KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION") + endif() + + if(KOKKOS_ENABLE_CUDA) + if(KOKKOS_ENABLE_CUDA_LDG_INTRINSIC) + message(STATUS " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC") + endif() + + if(KOKKOS_ENABLE_CUDA_UVM) + message(STATUS " KOKKOS_ENABLE_CUDA_UVM") + endif() + + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + message(STATUS " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE") + endif() + + if(KOKKOS_ENABLE_CUDA_LAMBDA) + message(STATUS " KOKKOS_ENABLE_CUDA_LAMBDA") + endif() + + if(KOKKOS_CUDA_DIR) + message(STATUS " KOKKOS_CUDA_DIR: ${KOKKOS_CUDA_DIR}") + endif() + endif() + + if(KOKKOS_QTHREADS_DIR) + message(STATUS " KOKKOS_QTHREADS_DIR: ${KOKKOS_QTHREADS_DIR}") + endif() + + 
if(KOKKOS_HWLOC_DIR) + message(STATUS " KOKKOS_HWLOC_DIR: ${KOKKOS_HWLOC_DIR}") + endif() + + if(KOKKOS_MEMKIND_DIR) + message(STATUS " KOKKOS_MEMKIND_DIR: ${KOKKOS_MEMKIND_DIR}") + endif() + + message(STATUS "") + message(STATUS "Final kokkos settings variable:") + message(STATUS " ${KOKKOS_SETTINGS}") + + message(STATUS "*****************************************************") + message(STATUS "") +endif() diff --git a/lib/kokkos/cmake/tribits.cmake b/lib/kokkos/cmake/tribits.cmake index 0f00f1dd2e..321704a1c8 100644 --- a/lib/kokkos/cmake/tribits.cmake +++ b/lib/kokkos/cmake/tribits.cmake @@ -3,10 +3,6 @@ INCLUDE(CTest) cmake_policy(SET CMP0054 NEW) -IF(NOT DEFINED ${PROJECT_NAME}) - project(KokkosCMake) -ENDIF() - MESSAGE(WARNING "The project name is: ${PROJECT_NAME}") IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) @@ -46,26 +42,26 @@ MACRO(PREPEND_GLOBAL_SET VARNAME) GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) ENDMACRO() -FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME) - ASSERT_DEFINED(${VARNAME}) - IF (${VARNAME}) - SET(TMP ${${VARNAME}}) - LIST(REMOVE_DUPLICATES TMP) - GLOBAL_SET(${VARNAME} ${TMP}) - ENDIF() -ENDFUNCTION() +#FUNCTION(REMOVE_GLOBAL_DUPLICATES VARNAME) +# ASSERT_DEFINED(${VARNAME}) +# IF (${VARNAME}) +# SET(TMP ${${VARNAME}}) +# LIST(REMOVE_DUPLICATES TMP) +# GLOBAL_SET(${VARNAME} ${TMP}) +# ENDIF() +#ENDFUNCTION() -MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE) - MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' '${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'") - SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" ) - IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "") - IF(${USER_OPTION_NAME}) - GLOBAL_SET(${MACRO_DEFINE_NAME} ON) - ELSE() - GLOBAL_SET(${MACRO_DEFINE_NAME} OFF) - ENDIF() - ENDIF() -ENDMACRO() +#MACRO(TRIBITS_ADD_OPTION_AND_DEFINE USER_OPTION_NAME MACRO_DEFINE_NAME DOCSTRING DEFAULT_VALUE) +# MESSAGE(STATUS "TRIBITS_ADD_OPTION_AND_DEFINE: '${USER_OPTION_NAME}' 
'${MACRO_DEFINE_NAME}' '${DEFAULT_VALUE}'") +# SET( ${USER_OPTION_NAME} "${DEFAULT_VALUE}" CACHE BOOL "${DOCSTRING}" ) +# IF(NOT ${MACRO_DEFINE_NAME} STREQUAL "") +# IF(${USER_OPTION_NAME}) +# GLOBAL_SET(${MACRO_DEFINE_NAME} ON) +# ELSE() +# GLOBAL_SET(${MACRO_DEFINE_NAME} OFF) +# ENDIF() +# ENDIF() +#ENDMACRO() FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE) @@ -77,17 +73,20 @@ FUNCTION(TRIBITS_CONFIGURE_FILE PACKAGE_NAME_CONFIG_FILE) ENDFUNCTION() -MACRO(TRIBITS_ADD_DEBUG_OPTION) - TRIBITS_ADD_OPTION_AND_DEFINE( - ${PROJECT_NAME}_ENABLE_DEBUG - HAVE_${PROJECT_NAME_UC}_DEBUG - "Enable a host of runtime debug checking." - OFF - ) -ENDMACRO() +#MACRO(TRIBITS_ADD_DEBUG_OPTION) +# TRIBITS_ADD_OPTION_AND_DEFINE( +# ${PROJECT_NAME}_ENABLE_DEBUG +# HAVE_${PROJECT_NAME_UC}_DEBUG +# "Enable a host of runtime debug checking." +# OFF +# ) +#ENDMACRO() MACRO(TRIBITS_ADD_TEST_DIRECTORIES) + message(STATUS "ProjectName: " ${PROJECT_NAME}) + message(STATUS "Tests: " ${${PROJECT_NAME}_ENABLE_TESTS}) + IF(${${PROJECT_NAME}_ENABLE_TESTS}) FOREACH(TEST_DIR ${ARGN}) ADD_SUBDIRECTORY(${TEST_DIR}) @@ -387,17 +386,17 @@ FUNCTION(TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) ENDFUNCTION() -MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE) - GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE) - INCLUDE("${TPL_FILE}") - IF(TARGET TPL_LIB_${TPL_NAME}) - MESSAGE(STATUS "Found tpl library: ${TPL_NAME}") - SET(TPL_ENABLE_${TPL_NAME} TRUE) - ELSE() - MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}") - SET(TPL_ENABLE_${TPL_NAME} FALSE) - ENDIF() -ENDMACRO() +#MACRO(TRIBITS_PROCESS_TPL_DEP_FILE TPL_FILE) +# GET_FILENAME_COMPONENT(TPL_NAME ${TPL_FILE} NAME_WE) +# INCLUDE("${TPL_FILE}") +# IF(TARGET TPL_LIB_${TPL_NAME}) +# MESSAGE(STATUS "Found tpl library: ${TPL_NAME}") +# SET(TPL_ENABLE_${TPL_NAME} TRUE) +# ELSE() +# MESSAGE(STATUS "Tpl library not found: ${TPL_NAME}") +# SET(TPL_ENABLE_${TPL_NAME} FALSE) +# ENDIF() +#ENDMACRO() MACRO(PREPEND_TARGET_SET VARNAME TARGET_NAME TYPE) 
IF(TYPE STREQUAL "REQUIRED") @@ -475,6 +474,7 @@ MACRO(TRIBITS_SUBPACKAGE NAME) SET(PARENT_PACKAGE_NAME ${PACKAGE_NAME}) SET(PACKAGE_NAME ${PACKAGE_NAME}${NAME}) STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) + SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) ADD_INTERFACE_LIBRARY(PACKAGE_${PACKAGE_NAME}) @@ -494,11 +494,11 @@ MACRO(TRIBITS_PACKAGE_DECL NAME) SET(${PACKAGE_NAME}_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) STRING(TOUPPER ${PACKAGE_NAME} PACKAGE_NAME_UC) - SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps") - FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake") - FOREACH(TPL_FILE ${TPLS_FILES}) - TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE}) - ENDFOREACH() + #SET(TRIBITS_DEPS_DIR "${CMAKE_SOURCE_DIR}/cmake/deps") + #FILE(GLOB TPLS_FILES "${TRIBITS_DEPS_DIR}/*.cmake") + #FOREACH(TPL_FILE ${TPLS_FILES}) + # TRIBITS_PROCESS_TPL_DEP_FILE(${TPL_FILE}) + #ENDFOREACH() ENDMACRO() diff --git a/lib/kokkos/config/master_history.txt b/lib/kokkos/config/master_history.txt index 6f9ca897d9..06c3f95a80 100644 --- a/lib/kokkos/config/master_history.txt +++ b/lib/kokkos/config/master_history.txt @@ -10,3 +10,5 @@ tag: 2.03.05 date: 05:27:2017 master: 36b92f43 develop: 79073186 tag: 2.03.13 date: 07:27:2017 master: da314444 develop: 29ccb58a tag: 2.04.00 date: 08:16:2017 master: 54eb75c0 develop: 32fb8ee1 tag: 2.04.04 date: 09:11:2017 master: 2b7e9c20 develop: 51e7b25a +tag: 2.04.11 date: 10:28:2017 master: 54a1330a develop: ed36c017 +tag: 2.5.11 date: 12:15:2017 master: dfe685f4 develop: ec7ad6d8 diff --git a/lib/kokkos/config/nvcc_wrapper b/lib/kokkos/config/nvcc_wrapper index cb206cf88b..d339da4fcd 100755 --- a/lib/kokkos/config/nvcc_wrapper +++ b/lib/kokkos/config/nvcc_wrapper @@ -39,6 +39,12 @@ cuda_args="" # Arguments for both NVCC and Host compiler shared_args="" +# Argument -c +compile_arg="" + +# Argument -o +output_arg="" + # Linker arguments xlinker_args="" @@ -66,6 +72,7 @@ dry_run=0 # Skip NVCC compilation and use host compiler directly host_only=0 
+host_only_args="" # Enable workaround for CUDA 6.5 for pragma ident replace_pragma_ident=0 @@ -78,6 +85,14 @@ temp_dir=${TMPDIR:-/tmp} # Check if we have an optimization argument already optimization_applied=0 +# Check if we have -std=c++X or --std=c++X already +stdcxx_applied=0 + +# Run nvcc a second time to generate dependencies if needed +depfile_separate=0 +depfile_output_arg="" +depfile_target_arg="" + #echo "Arguments: $# $@" while [ $# -gt 0 ] @@ -109,12 +124,31 @@ do fi ;; #Handle shared args (valid for both nvcc and the host compiler) - -D*|-c|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared) + -D*|-I*|-L*|-l*|-g|--help|--version|-E|-M|-shared) shared_args="$shared_args $1" ;; - #Handle shared args that have an argument - -o|-MT) - shared_args="$shared_args $1 $2" + #Handle compilation argument + -c) + compile_arg="$1" + ;; + #Handle output argument + -o) + output_arg="$output_arg $1 $2" + shift + ;; + # Handle depfile arguments. We map them to a separate call to nvcc. + -MD|-MMD) + depfile_separate=1 + host_only_args="$host_only_args $1" + ;; + -MF) + depfile_output_arg="-o $2" + host_only_args="$host_only_args $1 $2" + shift + ;; + -MT) + depfile_target_arg="$1 $2" + host_only_args="$host_only_args $1 $2" shift ;; #Handle known nvcc args @@ -130,16 +164,25 @@ do cuda_args="$cuda_args $1 $2" shift ;; - #Handle c++11 setting - --std=c++11|-std=c++11) - shared_args="$shared_args $1" + #Handle c++11 + --std=c++11|-std=c++11|--std=c++14|-std=c++14|--std=c++1z|-std=c++1z) + if [ $stdcxx_applied -eq 1 ]; then + echo "nvcc_wrapper - *warning* you have set multiple optimization flags (-std=c++1* or --std=c++1*), only the first is used because nvcc can only accept a single std setting" + else + shared_args="$shared_args $1" + stdcxx_applied=1 + fi ;; + #strip of -std=c++98 due to nvcc warnings and Tribits will place both -std=c++11 and -std=c++98 -std=c++98|--std=c++98) ;; #strip of pedantic because it produces endless warnings about #LINE added by the preprocessor 
-pedantic|-Wpedantic|-ansi) ;; + #strip of -Woverloaded-virtual to avoid "cc1: warning: command line option ‘-Woverloaded-virtual’ is valid for C++/ObjC++ but not for C" + -Woverloaded-virtual) + ;; #strip -Xcompiler because we add it -Xcompiler) if [ $first_xcompiler_arg -eq 1 ]; then @@ -190,7 +233,7 @@ do object_files_xlinker="$object_files_xlinker -Xlinker $1" ;; #Handle object files which always need to use "-Xlinker": -x cu applies to all input files, so give them to linker, except if only linking - *.dylib) + @*|*.dylib) object_files="$object_files -Xlinker $1" object_files_xlinker="$object_files_xlinker -Xlinker $1" ;; @@ -230,7 +273,7 @@ if [ $first_xcompiler_arg -eq 0 ]; then fi #Compose host only command -host_command="$host_compiler $shared_args $xcompiler_args $host_linker_args $shared_versioned_libraries_host" +host_command="$host_compiler $shared_args $host_only_args $compile_arg $output_arg $xcompiler_args $host_linker_args $shared_versioned_libraries_host" #nvcc does not accept '#pragma ident SOME_MACRO_STRING' but it does accept '#ident SOME_MACRO_STRING' if [ $replace_pragma_ident -eq 1 ]; then @@ -262,10 +305,21 @@ else host_command="$host_command $object_files" fi +if [ $depfile_separate -eq 1 ]; then + # run nvcc a second time to generate dependencies (without compiling) + nvcc_depfile_command="$nvcc_command -M $depfile_target_arg $depfile_output_arg" +else + nvcc_depfile_command="" +fi + +nvcc_command="$nvcc_command $compile_arg $output_arg" + #Print command for dryrun if [ $dry_run -eq 1 ]; then if [ $host_only -eq 1 ]; then echo $host_command + elif [ -n "$nvcc_depfile_command" ]; then + echo $nvcc_command "&&" $nvcc_depfile_command else echo $nvcc_command fi @@ -275,6 +329,8 @@ fi #Run compilation command if [ $host_only -eq 1 ]; then $host_command +elif [ -n "$nvcc_depfile_command" ]; then + $nvcc_command && $nvcc_depfile_command else $nvcc_command fi diff --git a/lib/kokkos/config/test_all_sandia b/lib/kokkos/config/test_all_sandia index 
e6fcaad261..660ab91ff5 100755 --- a/lib/kokkos/config/test_all_sandia +++ b/lib/kokkos/config/test_all_sandia @@ -16,12 +16,12 @@ if [[ "$HOSTNAME" =~ (white|ride).* ]]; then MACHINE=white elif [[ "$HOSTNAME" =~ .*bowman.* ]]; then MACHINE=bowman -elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name +elif [[ "$HOSTNAME" =~ n.* ]]; then # Warning: very generic name if [[ "$PROCESSOR" = "aarch64" ]]; then MACHINE=sullivan - else - MACHINE=shepard fi +elif [[ "$HOSTNAME" =~ node.* ]]; then # Warning: very generic name + MACHINE=shepard elif [[ "$HOSTNAME" =~ apollo ]]; then MACHINE=apollo elif [[ "$HOSTNAME" =~ sullivan ]]; then @@ -45,7 +45,8 @@ GCC_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits IBM_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" CLANG_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" INTEL_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" -CUDA_WARNING_FLAGS="" +CUDA_WARNING_FLAGS="-Wall,-Wshadow,-pedantic,-Werror,-Wsign-compare,-Wtype-limits,-Wuninitialized" +PGI_WARNING_FLAGS="" # Default. Machine specific can override. 
DEBUG=False @@ -61,6 +62,8 @@ SPOT_CHECK=False PRINT_HELP=False OPT_FLAG="" +CXX_FLAGS_EXTRA="" +LD_FLAGS_EXTRA="" KOKKOS_OPTIONS="" # @@ -111,6 +114,12 @@ do --with-cuda-options*) KOKKOS_CUDA_OPTIONS="--with-cuda-options=${key#*=}" ;; + --cxxflags-extra*) + CXX_FLAGS_EXTRA="${key#*=}" + ;; + --ldflags-extra*) + LD_FLAGS_EXTRA="${key#*=}" + ;; --help*) PRINT_HELP=True ;; @@ -150,20 +159,18 @@ if [ "$MACHINE" = "sems" ]; then if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" - "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" - "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" + COMPILERS=("gcc/5.3.0 $BASE_MODULE_LIST "OpenMP" g++ $GCC_WARNING_FLAGS" + "gcc/6.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" "cuda/8.0.44 $CUDA8_MODULE_LIST "Cuda_OpenMP" $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) else # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/16.0.3 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" @@ -184,6 +191,7 @@ elif [ "$MACHINE" = 
"white" ]; then BASE_MODULE_LIST="/" IBM_MODULE_LIST="/xl/" CUDA_MODULE_LIST="/,gcc/5.4.0" + CUDA_MODULE_LIST2="/,gcc/6.3.0,ibm/xl/13.1.6-BETA" # Don't do pthread on white. GCC_BUILD_LIST="OpenMP,Serial,OpenMP_Serial" @@ -192,6 +200,7 @@ elif [ "$MACHINE" = "white" ]; then COMPILERS=("gcc/5.4.0 $BASE_MODULE_LIST $IBM_BUILD_LIST g++ $GCC_WARNING_FLAGS" "ibm/13.1.3 $IBM_MODULE_LIST $IBM_BUILD_LIST xlC $IBM_WARNING_FLAGS" "cuda/8.0.44 $CUDA_MODULE_LIST $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" + "cuda/9.0.103 $CUDA_MODULE_LIST2 $CUDA_IBM_BUILD_LIST ${KOKKOS_PATH}/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" ) if [ -z "$ARCH_FLAG" ]; then @@ -210,8 +219,9 @@ elif [ "$MACHINE" = "bowman" ]; then OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/16.4.258 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.2.174 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.128 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" ) if [ -z "$ARCH_FLAG" ]; then @@ -241,13 +251,13 @@ elif [ "$MACHINE" = "shepard" ]; then SKIP_HWLOC=True export SLURM_TASKS_PER_NODE=32 - BASE_MODULE_LIST="/compilers/" - - OLD_INTEL_BUILD_LIST="Pthread,Serial,Pthread_Serial" + BASE_MODULE_LIST="/" + BASE_MODULE_LIST_INTEL="/compilers/" # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("intel/16.2.181 $BASE_MODULE_LIST $OLD_INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" - "intel/17.0.098 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + COMPILERS=("intel/17.4.196 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/18.0.128 $BASE_MODULE_LIST_INTEL $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + 
"pgi/17.10.0 $BASE_MODULE_LIST $GCC_BUILD_LIST pgc++ $PGI_WARNING_FLAGS" ) if [ -z "$ARCH_FLAG" ]; then @@ -280,7 +290,7 @@ elif [ "$MACHINE" = "apollo" ]; then if [ "$SPOT_CHECK" = "True" ]; then # Format: (compiler module-list build-list exe-name warning-flag) - COMPILERS=("gcc/4.7.2 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" + COMPILERS=("gcc/4.8.4 $BASE_MODULE_LIST "OpenMP,Pthread" g++ $GCC_WARNING_FLAGS" "gcc/5.1.0 $BASE_MODULE_LIST "Serial" g++ $GCC_WARNING_FLAGS" "intel/16.0.1 $BASE_MODULE_LIST "OpenMP" icpc $INTEL_WARNING_FLAGS" "clang/3.9.0 $BASE_MODULE_LIST "Pthread_Serial" clang++ $CLANG_WARNING_FLAGS" @@ -292,14 +302,13 @@ elif [ "$MACHINE" = "apollo" ]; then COMPILERS=("cuda/8.0.44 $CUDA8_MODULE_LIST $BUILD_LIST_CUDA_NVCC $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" "clang/4.0.0 $CLANG_MODULE_LIST $BUILD_LIST_CUDA_CLANG clang++ $CUDA_WARNING_FLAGS" "clang/3.9.0 $CLANG_MODULE_LIST $BUILD_LIST_CLANG clang++ $CLANG_WARNING_FLAGS" - "gcc/4.7.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/4.8.4 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "gcc/4.9.2 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" + "gcc/4.9.3 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/5.3.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" "gcc/6.1.0 $BASE_MODULE_LIST $GCC_BUILD_LIST g++ $GCC_WARNING_FLAGS" - "intel/14.0.4 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/15.0.2 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "intel/16.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" + "intel/17.0.1 $BASE_MODULE_LIST $INTEL_BUILD_LIST icpc $INTEL_WARNING_FLAGS" "clang/3.5.2 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "clang/3.6.1 $BASE_MODULE_LIST $CLANG_BUILD_LIST clang++ $CLANG_WARNING_FLAGS" "cuda/7.0.28 $CUDA_MODULE_LIST $CUDA_BUILD_LIST $KOKKOS_PATH/bin/nvcc_wrapper $CUDA_WARNING_FLAGS" @@ -336,6 +345,8 @@ if [ 
"$PRINT_HELP" = "True" ]; then echo "--dry-run: Just print what would be executed" echo "--build-only: Just do builds, don't run anything" echo "--opt-flag=FLAG: Optimization flag (default: -O3)" + echo "--cxxflags-extra=FLAGS: Extra flags to be added to CXX_FLAGS" + echo "--ldflags-extra=FLAGS: Extra flags to be added to LD_FLAGS" echo "--arch=ARCHITECTURE: overwrite architecture flags" echo "--with-cuda-options=OPT: set KOKKOS_CUDA_OPTIONS" echo "--build-list=BUILD,BUILD,BUILD..." @@ -361,14 +372,14 @@ if [ "$PRINT_HELP" = "True" ]; then echo " Run all gcc tests" echo " % test_all_sandia gcc" echo "" - echo " Run all gcc/4.7.2 and all intel tests" - echo " % test_all_sandia gcc/4.7.2 intel" + echo " Run all gcc/4.8.4 and all intel tests" + echo " % test_all_sandia gcc/4.8.4 intel" echo "" echo " Run all tests in debug" echo " % test_all_sandia --debug" echo "" - echo " Run gcc/4.7.2 and only do OpenMP and OpenMP_Serial builds" - echo " % test_all_sandia gcc/4.7.2 --build-list=OpenMP,OpenMP_Serial" + echo " Run gcc/4.8.4 and only do OpenMP and OpenMP_Serial builds" + echo " % test_all_sandia gcc/4.8.4 --build-list=OpenMP,OpenMP_Serial" echo "" echo "If you want to kill the tests, do:" echo " hit ctrl-z" @@ -566,10 +577,15 @@ single_build_and_test() { if [[ "$build_type" = *debug* ]]; then local extra_args="$extra_args --debug" local cxxflags="-g $compiler_warning_flags" + local ldflags="-g" else local cxxflags="$OPT_FLAG $compiler_warning_flags" + local ldflags="${OPT_FLAG}" fi + local cxxflags="${cxxflags} ${CXX_FLAGS_EXTRA}" + local ldflags="${ldflags} ${LD_FLAGS_EXTRA}" + if [[ "$KOKKOS_CUDA_OPTIONS" != "" ]]; then local extra_args="$extra_args $KOKKOS_CUDA_OPTIONS" fi @@ -586,7 +602,7 @@ single_build_and_test() { run_cmd ls fake_problem >& ${desc}.configure.log || { report_and_log_test_result 1 $desc configure && return 0; } fi else - run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) 
--cxxflags=\"$cxxflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } + run_cmd ${KOKKOS_PATH}/generate_makefile.bash --with-devices=$build $ARCH_FLAG --compiler=$(which $compiler_exe) --cxxflags=\"$cxxflags\" --ldflags=\"$ldflags\" $extra_args &>> ${desc}.configure.log || { report_and_log_test_result 1 ${desc} configure && return 0; } local -i build_start_time=$(date +%s) run_cmd make -j 32 build-test >& ${desc}.build.log || { report_and_log_test_result 1 ${desc} build && return 0; } local -i build_end_time=$(date +%s) diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel index 6527df2eb9..f122a1b36f 100755 --- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel +++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_pthread_intel @@ -1,6 +1,6 @@ #!/bin/bash -el ulimit -c 0 -module load devpack/openmpi/1.10.0/intel/16.1.056/cuda/none +module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none KOKKOS_BRANCH=$1 TRILINOS_UPDATE_BRANCH=$2 diff --git a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel index 1a306bc2b2..f9b2200a43 100755 --- a/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel +++ b/lib/kokkos/config/trilinos-integration/shepard_jenkins_run_script_serial_intel @@ -1,6 +1,6 @@ #!/bin/bash -el ulimit -c 0 -module load devpack/openmpi/1.10.0/intel/16.1.056/cuda/none +module load devpack/openmpi/2.1.1/intel/17.4.196/cuda/none KOKKOS_BRANCH=$1 TRILINOS_UPDATE_BRANCH=$2 diff --git a/lib/kokkos/containers/CMakeLists.txt b/lib/kokkos/containers/CMakeLists.txt index 894935fa01..c37aa3e3e2 100644 --- a/lib/kokkos/containers/CMakeLists.txt +++ b/lib/kokkos/containers/CMakeLists.txt @@ -2,7 +2,10 @@ 
TRIBITS_SUBPACKAGE(Containers) -ADD_SUBDIRECTORY(src) + +IF(KOKKOS_HAS_TRILINOS) + ADD_SUBDIRECTORY(src) +ENDIF() TRIBITS_ADD_TEST_DIRECTORIES(unit_tests) TRIBITS_ADD_TEST_DIRECTORIES(performance_tests) diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index 403ac746f6..1203a8bd81 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -3,6 +3,14 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +IF(NOT KOKKOS_HAS_TRILINOS) + IF(KOKKOS_SEPARATE_LIBS) + set(TEST_LINK_TARGETS kokkoscore) + ELSE() + set(TEST_LINK_TARGETS kokkos) + ENDIF() +ENDIF() + SET(SOURCES TestMain.cpp TestCuda.cpp @@ -24,7 +32,7 @@ TRIBITS_ADD_EXECUTABLE( PerfTestExec SOURCES ${SOURCES} COMM serial mpi - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) TRIBITS_ADD_TEST( diff --git a/lib/kokkos/containers/performance_tests/Makefile b/lib/kokkos/containers/performance_tests/Makefile index ec69363a17..ebed75ccd6 100644 --- a/lib/kokkos/containers/performance_tests/Makefile +++ b/lib/kokkos/containers/performance_tests/Makefile @@ -15,7 +15,8 @@ endif CXXFLAGS = -O3 LINK ?= $(CXX) -LDFLAGS ?= -lpthread +LDFLAGS ?= +override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos @@ -30,6 +31,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_ROCM = TestROCm.o TestMain.o gtest-all.o + TARGETS += KokkosContainers_PerformanceTest_ROCm + TEST_TARGETS += test-rocm +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_THREADS = TestThreads.o TestMain.o gtest-all.o TARGETS += KokkosContainers_PerformanceTest_Threads @@ -45,6 +52,9 @@ endif KokkosContainers_PerformanceTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) 
$(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Cuda +KokkosContainers_PerformanceTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_ROCm + KokkosContainers_PerformanceTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) -o KokkosContainers_PerformanceTest_Threads @@ -54,6 +64,9 @@ KokkosContainers_PerformanceTest_OpenMP: $(OBJ_OPENMP) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosContainers_PerformanceTest_Cuda ./KokkosContainers_PerformanceTest_Cuda +test-rocm: KokkosContainers_PerformanceTest_ROCm + ./KokkosContainers_PerformanceTest_ROCm + test-threads: KokkosContainers_PerformanceTest_Threads ./KokkosContainers_PerformanceTest_Threads diff --git a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp index 4c0ccb6b88..ced74c6f51 100644 --- a/lib/kokkos/containers/performance_tests/TestDynRankView.hpp +++ b/lib/kokkos/containers/performance_tests/TestDynRankView.hpp @@ -180,8 +180,8 @@ void test_dynrankview_op_perf( const int par_size ) typedef DeviceType execution_space; typedef typename execution_space::size_type size_type; - const size_type dim2 = 90; - const size_type dim3 = 30; + const size_type dim_2 = 90; + const size_type dim_3 = 30; double elapsed_time_view = 0; double elapsed_time_compview = 0; @@ -191,7 +191,7 @@ void test_dynrankview_op_perf( const int par_size ) double elapsed_time_compdrview = 0; Kokkos::Timer timer; { - Kokkos::View testview("testview",par_size,dim2,dim3); + Kokkos::View testview("testview",par_size,dim_2,dim_3); typedef InitViewFunctor FunctorType; timer.reset(); @@ -220,7 +220,7 @@ void test_dynrankview_op_perf( const int par_size ) std::cout << " Strided View time (init only): " << 
elapsed_time_strideview << std::endl; } { - Kokkos::View testview("testview",par_size,dim2,dim3,1,1,1,1); + Kokkos::View testview("testview",par_size,dim_2,dim_3,1,1,1,1); typedef InitViewRank7Functor FunctorType; timer.reset(); @@ -231,7 +231,7 @@ void test_dynrankview_op_perf( const int par_size ) std::cout << " View Rank7 time (init only): " << elapsed_time_view_rank7 << std::endl; } { - Kokkos::DynRankView testdrview("testdrview",par_size,dim2,dim3); + Kokkos::DynRankView testdrview("testdrview",par_size,dim_2,dim_3); typedef InitDynRankViewFunctor FunctorType; timer.reset(); diff --git a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp index 6631184624..012f45bab7 100644 --- a/lib/kokkos/containers/performance_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/performance_tests/TestOpenMP.cpp @@ -54,6 +54,7 @@ #include #include +#include #include #include @@ -122,6 +123,18 @@ TEST_F( openmp, unordered_map_performance_far) Perf::run_performance_tests(base_file_name.str()); } +TEST_F( openmp, scatter_view) +{ + std::cout << "ScatterView data-duplicated test:\n"; + Perf::test_scatter_view(10, 1000 * 1000); +//std::cout << "ScatterView atomics test:\n"; +//Perf::test_scatter_view(10, 1000 * 1000); +} + } // namespace test #else void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTOPENMP_PREVENT_EMPTY_LINK_ERROR() {} diff --git a/lib/kokkos/containers/performance_tests/TestROCm.cpp b/lib/kokkos/containers/performance_tests/TestROCm.cpp new file mode 100644 index 0000000000..6647d23065 --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestROCm.cpp @@ -0,0 +1,113 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#if defined( KOKKOS_ENABLE_ROCM ) + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +#include + +#include + +#include + +namespace Performance { + +class rocm : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; +#if 0 +// issue 1089 +TEST_F( rocm, dynrankview_perf ) +{ + std::cout << "ROCm" << std::endl; + std::cout << " DynRankView vs View: Initialization Only " << std::endl; + test_dynrankview_op_perf( 40960 ); +} + +TEST_F( rocm, global_2_local) +{ + std::cout << "ROCm" << std::endl; + std::cout << "size, create, generate, fill, find" << std::endl; + for (unsigned i=Performance::begin_id_size; i<=Performance::end_id_size; i *= Performance::id_step) + test_global_to_local_ids(i); +} + +#endif +TEST_F( rocm, unordered_map_performance_near) +{ + Perf::run_performance_tests("rocm-near"); +} + +TEST_F( rocm, unordered_map_performance_far) +{ + Perf::run_performance_tests("rocm-far"); +} + +} +#else +void KOKKOS_CONTAINERS_PERFORMANCE_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {} +#endif /* #if defined( KOKKOS_ENABLE_ROCM ) */ diff --git a/lib/kokkos/containers/performance_tests/TestScatterView.hpp b/lib/kokkos/containers/performance_tests/TestScatterView.hpp new file mode 100644 index 0000000000..4fd69173c0 --- /dev/null +++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp @@ -0,0 +1,113 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SCATTER_VIEW_HPP +#define KOKKOS_TEST_SCATTER_VIEW_HPP + +#include +#include + +namespace Perf { + +template +void test_scatter_view(int m, int n) +{ + Kokkos::View original_view("original_view", n); + { + auto scatter_view = Kokkos::Experimental::create_scatter_view + < Kokkos::Experimental::ScatterSum + , duplication + , contribution + > (original_view); + Kokkos::Experimental::UniqueToken< + ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global> + unique_token{ExecSpace()}; + //auto internal_view = scatter_view.internal_view; + auto policy = Kokkos::RangePolicy(0, n); + for (int foo = 0; foo < 5; ++foo) { + { + auto num_threads = unique_token.size(); + std::cout << "num_threads " << num_threads << '\n'; + Kokkos::View hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + auto f2 = KOKKOS_LAMBDA(int i) { + auto thread_id = unique_token.acquire(); + for (int j = 0; j < 10; ++j) { + auto k = (i + j) % n; + hand_coded_duplicate_view(thread_id, k, 0) += 4.2; + hand_coded_duplicate_view(thread_id, k, 1) += 2.0; + hand_coded_duplicate_view(thread_id, k, 2) += 1.0; + } + }; + Kokkos::Timer timer; + timer.reset(); + for (int k = 0; k < m; ++k) { + Kokkos::parallel_for(policy, f2, "hand_coded_duplicate_scatter_view_test"); + } + auto t = timer.seconds(); + std::cout << "hand-coded test took " << t << " seconds\n"; + } + { + auto f = KOKKOS_LAMBDA(int i) { + auto scatter_access = scatter_view.access(); + for (int j = 0; j < 10; ++j) { + auto k = (i + j) % n; + scatter_access(k, 0) += 4.2; + scatter_access(k, 1) += 2.0; + scatter_access(k, 2) += 1.0; + } + }; + Kokkos::Timer timer; + timer.reset(); + for (int k = 0; k < m; ++k) { + Kokkos::parallel_for(policy, f, "scatter_view_test"); + } + auto t = timer.seconds(); + std::cout << "test took " << t << " seconds\n"; + } + } + } +} + +} + +#endif diff 
--git a/lib/kokkos/containers/src/CMakeLists.txt b/lib/kokkos/containers/src/CMakeLists.txt index da5a791530..e68fcad5e9 100644 --- a/lib/kokkos/containers/src/CMakeLists.txt +++ b/lib/kokkos/containers/src/CMakeLists.txt @@ -6,26 +6,42 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -SET(HEADERS "") -SET(SOURCES "") - -SET(HEADERS_IMPL "") - -FILE(GLOB HEADERS *.hpp) -FILE(GLOB HEADERS_IMPL impl/*.hpp) -FILE(GLOB SOURCES impl/*.cpp) - SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) -INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/) +if(KOKKOS_LEGACY_TRIBITS) -TRIBITS_ADD_LIBRARY( - kokkoscontainers - HEADERS ${HEADERS} - NOINSTALLHEADERS ${HEADERS_IMPL} - SOURCES ${SOURCES} - DEPLIBS - ) + SET(HEADERS "") + SET(SOURCES "") + SET(HEADERS_IMPL "") + + FILE(GLOB HEADERS *.hpp) + FILE(GLOB HEADERS_IMPL impl/*.hpp) + FILE(GLOB SOURCES impl/*.cpp) + + INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/) + + TRIBITS_ADD_LIBRARY( + kokkoscontainers + HEADERS ${HEADERS} + NOINSTALLHEADERS ${HEADERS_IMPL} + SOURCES ${SOURCES} + DEPLIBS + ) + +else() + + INSTALL ( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" + DESTINATION ${TRILINOS_INCDIR} + FILES_MATCHING PATTERN "*.hpp" + ) + + TRIBITS_ADD_LIBRARY( + kokkoscontainers + SOURCES ${KOKKOS_CONTAINERS_SRCS} + DEPLIBS + ) + +endif() #----------------------------------------------------------------------------- - diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index e9059d64c4..86275ce7c9 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -72,8 +72,10 @@ private: , "DynamicView must be rank-one" ); static_assert( std::is_trivial< typename traits::value_type >::value && - std::is_same< typename traits::specialize , void >::value - , "DynamicView must 
have trivial data type" ); + std::is_same< typename traits::specialize , void >::value && + Kokkos::Impl::is_power_of_two + ::value + , "DynamicView must have trivial value_type and sizeof(value_type) is a power-of-two"); template< class Space , bool = Kokkos::Impl::MemorySpaceAccess< Space , typename traits::memory_space >::accessible > struct verify_space diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp new file mode 100644 index 0000000000..48c4709480 --- /dev/null +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -0,0 +1,999 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + + +/// \file Kokkos_ScatterView.hpp +/// \brief Declaration and definition of Kokkos::ScatterView. +/// +/// This header file declares and defines Kokkos::ScatterView and its +/// related nonmember functions. + +#ifndef KOKKOS_SCATTER_VIEW_HPP +#define KOKKOS_SCATTER_VIEW_HPP + +#include +#include + +namespace Kokkos { +namespace Experimental { + +//TODO: replace this enum with the Kokkos::Sum, etc reducers for parallel_reduce +enum : int { + ScatterSum, +}; + +enum : int { + ScatterNonDuplicated = 0, + ScatterDuplicated = 1 +}; + +enum : int { + ScatterNonAtomic = 0, + ScatterAtomic = 1 +}; + +}} // Kokkos::Experimental + +namespace Kokkos { +namespace Impl { +namespace Experimental { + +template +struct DefaultDuplication; + +template +struct DefaultContribution; + +#ifdef KOKKOS_ENABLE_SERIAL +template <> +struct DefaultDuplication { + enum : int { value = Kokkos::Experimental::ScatterNonDuplicated }; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterNonAtomic }; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterNonAtomic }; +}; +#endif + +#ifdef KOKKOS_ENABLE_OPENMP +template <> +struct DefaultDuplication { + enum : int { value = Kokkos::Experimental::ScatterDuplicated 
}; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterAtomic }; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterNonAtomic }; +}; +#endif + +#ifdef KOKKOS_ENABLE_THREADS +template <> +struct DefaultDuplication { + enum : int { value = Kokkos::Experimental::ScatterDuplicated }; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterAtomic }; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterNonAtomic }; +}; +#endif + +#ifdef KOKKOS_ENABLE_CUDA +template <> +struct DefaultDuplication { + enum : int { value = Kokkos::Experimental::ScatterNonDuplicated }; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterAtomic }; +}; +template <> +struct DefaultContribution { + enum : int { value = Kokkos::Experimental::ScatterAtomic }; +}; +#endif + +/* ScatterValue is the object returned by the access operator() of ScatterAccess, + similar to that returned by an Atomic View, it wraps Kokkos::atomic_add with convenient + operator+=, etc. 
*/ +template +struct ScatterValue; + +template +struct ScatterValue { + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {} + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ScatterValue&& other) : value( other.value ) {} + KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) { + value += rhs; + } + KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) { + value -= rhs; + } + private: + ValueType& value; +}; + +template +struct ScatterValue { + public: + KOKKOS_FORCEINLINE_FUNCTION ScatterValue(ValueType& value_in) : value( value_in ) {} + KOKKOS_FORCEINLINE_FUNCTION void operator+=(ValueType const& rhs) { + Kokkos::atomic_add(&value, rhs); + } + KOKKOS_FORCEINLINE_FUNCTION void operator-=(ValueType const& rhs) { + Kokkos::atomic_add(&value, -rhs); + } + private: + ValueType& value; +}; + +/* DuplicatedDataType, given a View DataType, will create a new DataType + that has a new runtime dimension which becomes the largest-stride dimension. + In the case of LayoutLeft, due to the limitation induced by the design of DataType + itself, it must convert any existing compile-time dimensions into runtime dimensions. 
*/ +template +struct DuplicatedDataType; + +template +struct DuplicatedDataType { + typedef T* value_type; // For LayoutRight, add a star all the way on the left +}; + +template +struct DuplicatedDataType { + typedef typename DuplicatedDataType::value_type value_type[N]; +}; + +template +struct DuplicatedDataType { + typedef typename DuplicatedDataType::value_type value_type[]; +}; + +template +struct DuplicatedDataType { + typedef typename DuplicatedDataType::value_type* value_type; +}; + +template +struct DuplicatedDataType { + typedef T* value_type; +}; + +template +struct DuplicatedDataType { + typedef typename DuplicatedDataType::value_type* value_type; +}; + +template +struct DuplicatedDataType { + typedef typename DuplicatedDataType::value_type* value_type; +}; + +template +struct DuplicatedDataType { + typedef typename DuplicatedDataType::value_type* value_type; +}; + +/* Slice is just responsible for stuffing the correct number of Kokkos::ALL + arguments on the correct side of the index in a call to subview() to get a + subview where the index specified is the largest-stride one. */ +template +struct Slice { + typedef Slice next; + typedef typename next::value_type value_type; + + static + value_type get(V const& src, const size_t i, Args ... args) { + return next::get(src, i, Kokkos::ALL, args...); + } +}; + +template +struct Slice { + typedef typename Kokkos::Impl::ViewMapping + < void + , V + , const size_t + , Args ... + >::type value_type; + static + value_type get(V const& src, const size_t i, Args ... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice { + typedef typename Kokkos::Impl::ViewMapping + < void + , V + , Args ... + , const size_t + >::type value_type; + static + value_type get(V const& src, const size_t i, Args ... 
args) { + return Kokkos::subview(src, args..., i); + } +}; + +template +struct ReduceDuplicates; + +template +struct ReduceDuplicatesBase { + typedef ReduceDuplicates Derived; + ValueType const* src; + ValueType* dst; + size_t stride; + size_t start; + size_t n; + ReduceDuplicatesBase(ValueType const* src_in, ValueType* dest_in, size_t stride_in, size_t start_in, size_t n_in, std::string const& name) + : src(src_in) + , dst(dest_in) + , stride(stride_in) + , start(start_in) + , n(n_in) + { +#if defined(KOKKOS_ENABLE_PROFILING) + uint64_t kpID = 0; + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor(std::string("reduce_") + name, 0, &kpID); + } +#endif + typedef RangePolicy policy_type; + typedef Kokkos::Impl::ParallelFor closure_type; + const closure_type closure(*(static_cast(this)), policy_type(0, stride)); + closure.execute(); +#if defined(KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } +#endif + } +}; + +template +struct ReduceDuplicates : + public ReduceDuplicatesBase +{ + typedef ReduceDuplicatesBase Base; + ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in, size_t start_in, size_t n_in, std::string const& name): + Base(src_in, dst_in, stride_in, start_in, n_in, name) + {} + KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { + for (size_t j = Base::start; j < Base::n; ++j) { + Base::dst[i] += Base::src[i + Base::stride * j]; + } + } +}; + +template +struct ResetDuplicates; + +template +struct ResetDuplicatesBase { + typedef ResetDuplicates Derived; + ValueType* data; + ResetDuplicatesBase(ValueType* data_in, size_t size_in, std::string const& name) + : data(data_in) + { +#if defined(KOKKOS_ENABLE_PROFILING) + uint64_t kpID = 0; + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::beginParallelFor(std::string("reduce_") + name, 0, &kpID); + } +#endif + typedef RangePolicy policy_type; + typedef 
Kokkos::Impl::ParallelFor closure_type; + const closure_type closure(*(static_cast(this)), policy_type(0, size_in)); + closure.execute(); +#if defined(KOKKOS_ENABLE_PROFILING) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } +#endif + } +}; + +template +struct ResetDuplicates : + public ResetDuplicatesBase +{ + typedef ResetDuplicatesBase Base; + ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name): + Base(data_in, size_in, name) + {} + KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { + Base::data[i] = Kokkos::reduction_identity::sum(); + } +}; + +}}} // Kokkos::Impl::Experimental + +namespace Kokkos { +namespace Experimental { + +template ::value + ,int contribution = Kokkos::Impl::Experimental::DefaultContribution::value + > +class ScatterView; + +template +class ScatterAccess; + +// non-duplicated implementation +template +class ScatterView +{ +public: + typedef Kokkos::View original_view_type; + typedef typename original_view_type::value_type original_value_type; + typedef typename original_view_type::reference_type original_reference_type; + friend class ScatterAccess; + friend class ScatterAccess; + + ScatterView() + { + } + + template + ScatterView(View const& original_view) + : internal_view(original_view) + { + } + + template + ScatterView(std::string const& name, Dims ... dims) + : internal_view(name, dims ...) 
+ { + } + + template + KOKKOS_FORCEINLINE_FUNCTION + ScatterAccess + access() const { + return ScatterAccess{*this}; + } + + original_view_type subview() const { + return internal_view; + } + + template + void contribute_into(View const& dest) const + { + typedef View dest_type; + static_assert(std::is_same< + typename dest_type::array_layout, + Layout>::value, + "ScatterView contribute destination has different layout"); + static_assert(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< + typename ExecSpace::memory_space, + typename dest_type::memory_space>::value, + "ScatterView contribute destination memory space not accessible"); + if (dest.data() == internal_view.data()) return; + Kokkos::Impl::Experimental::ReduceDuplicates( + internal_view.data(), + dest.data(), + 0, + 0, + 1, + internal_view.label()); + } + + void reset() { + Kokkos::Impl::Experimental::ResetDuplicates( + internal_view.data(), + internal_view.size(), + internal_view.label()); + } + template + void reset_except(View const& view) { + if (view.data() != internal_view.data()) reset(); + } + + void resize(const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0, + const size_t n7 = 0) { + ::Kokkos::resize(internal_view,n0,n1,n2,n3,n4,n5,n6,n7); + } + + void realloc(const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0, + const size_t n7 = 0) { + ::Kokkos::realloc(internal_view,n0,n1,n2,n3,n4,n5,n6,n7); + } + +protected: + template + KOKKOS_FORCEINLINE_FUNCTION + original_reference_type at(Args ... 
args) const { + return internal_view(args...); + } +private: + typedef original_view_type internal_view_type; + internal_view_type internal_view; +}; + +template +class ScatterAccess +{ +public: + typedef ScatterView view_type; + typedef typename view_type::original_value_type original_value_type; + typedef Kokkos::Impl::Experimental::ScatterValue< + original_value_type, Op, override_contribution> value_type; + + KOKKOS_INLINE_FUNCTION + ScatterAccess(view_type const& view_in) + : view(view_in) + { + } + + template + KOKKOS_FORCEINLINE_FUNCTION + value_type operator()(Args ... args) const { + return view.at(args...); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if::value, value_type>::type + operator[](Arg arg) const { + return view.at(arg); + } + +private: + view_type const& view; +}; + +// duplicated implementation +// LayoutLeft and LayoutRight are different enough that we'll just specialize each + +template +class ScatterView +{ +public: + typedef Kokkos::View original_view_type; + typedef typename original_view_type::value_type original_value_type; + typedef typename original_view_type::reference_type original_reference_type; + friend class ScatterAccess; + friend class ScatterAccess; + typedef typename Kokkos::Impl::Experimental::DuplicatedDataType data_type_info; + typedef typename data_type_info::value_type internal_data_type; + typedef Kokkos::View internal_view_type; + + ScatterView() + { + } + + template + ScatterView(View const& original_view) + : unique_token() + , internal_view(Kokkos::ViewAllocateWithoutInitializing( + std::string("duplicated_") + original_view.label()), + unique_token.size(), + original_view.extent(0), + original_view.extent(1), + original_view.extent(2), + original_view.extent(3), + original_view.extent(4), + original_view.extent(5), + original_view.extent(6)) + { + reset(); + } + + template + ScatterView(std::string const& name, Dims ... 
dims) + : internal_view(Kokkos::ViewAllocateWithoutInitializing(name), unique_token.size(), dims ...) + { + reset(); + } + + template + inline + ScatterAccess + access() const { + return ScatterAccess{*this}; + } + + typename Kokkos::Impl::Experimental::Slice< + Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::value_type + subview() const + { + return Kokkos::Impl::Experimental::Slice< + Kokkos::LayoutRight, internal_view_type::Rank, internal_view_type>::get(internal_view, 0); + } + + template + void contribute_into(View const& dest) const + { + typedef View dest_type; + static_assert(std::is_same< + typename dest_type::array_layout, + Kokkos::LayoutRight>::value, + "ScatterView deep_copy destination has different layout"); + static_assert(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< + typename ExecSpace::memory_space, + typename dest_type::memory_space>::value, + "ScatterView deep_copy destination memory space not accessible"); + size_t strides[8]; + internal_view.stride(strides); + bool is_equal = (dest.data() == internal_view.data()); + size_t start = is_equal ? 
1 : 0; + Kokkos::Impl::Experimental::ReduceDuplicates( + internal_view.data(), + dest.data(), + strides[0], + start, + internal_view.extent(0), + internal_view.label()); + } + + void reset() { + Kokkos::Impl::Experimental::ResetDuplicates( + internal_view.data(), + internal_view.size(), + internal_view.label()); + } + template + void reset_except(View const& view) { + if (view.data() != internal_view.data()) { + reset(); + return; + } + Kokkos::Impl::Experimental::ResetDuplicates( + internal_view.data() + view.size(), + internal_view.size() - view.size(), + internal_view.label()); + } + + void resize(const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0) { + ::Kokkos::resize(internal_view,unique_token.size(),n0,n1,n2,n3,n4,n5,n6); + } + + void realloc(const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0) { + ::Kokkos::realloc(internal_view,unique_token.size(),n0,n1,n2,n3,n4,n5,n6); + } + +protected: + template + KOKKOS_FORCEINLINE_FUNCTION + original_reference_type at(int rank, Args ... 
args) const { + return internal_view(rank, args...); + } + +protected: + typedef Kokkos::Experimental::UniqueToken< + ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global> unique_token_type; + + unique_token_type unique_token; + internal_view_type internal_view; +}; + +template +class ScatterView +{ +public: + typedef Kokkos::View original_view_type; + typedef typename original_view_type::value_type original_value_type; + typedef typename original_view_type::reference_type original_reference_type; + friend class ScatterAccess; + friend class ScatterAccess; + typedef typename Kokkos::Impl::Experimental::DuplicatedDataType data_type_info; + typedef typename data_type_info::value_type internal_data_type; + typedef Kokkos::View internal_view_type; + + ScatterView() + { + } + + template + ScatterView(View const& original_view) + : unique_token() + { + size_t arg_N[8] = { + original_view.extent(0), + original_view.extent(1), + original_view.extent(2), + original_view.extent(3), + original_view.extent(4), + original_view.extent(5), + original_view.extent(6), + 0 + }; + arg_N[internal_view_type::rank - 1] = unique_token.size(); + internal_view = internal_view_type( + Kokkos::ViewAllocateWithoutInitializing( + std::string("duplicated_") + original_view.label()), + arg_N[0], arg_N[1], arg_N[2], arg_N[3], + arg_N[4], arg_N[5], arg_N[6], arg_N[7]); + reset(); + } + + template + ScatterView(std::string const& name, Dims ... 
dims) + : internal_view(Kokkos::ViewAllocateWithoutInitializing(name), dims ..., unique_token.size()) + { + reset(); + } + + template + inline + ScatterAccess + access() const { + return ScatterAccess{*this}; + } + + typename Kokkos::Impl::Experimental::Slice< + Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::value_type + subview() const + { + return Kokkos::Impl::Experimental::Slice< + Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0); + } + + template + void contribute_into(View const& dest) const + { + typedef View dest_type; + static_assert(std::is_same< + typename dest_type::array_layout, + Kokkos::LayoutLeft>::value, + "ScatterView deep_copy destination has different layout"); + static_assert(Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< + typename ExecSpace::memory_space, + typename dest_type::memory_space>::value, + "ScatterView deep_copy destination memory space not accessible"); + size_t strides[8]; + internal_view.stride(strides); + size_t stride = strides[internal_view_type::rank - 1]; + auto extent = internal_view.extent( + internal_view_type::rank - 1); + bool is_equal = (dest.data() == internal_view.data()); + size_t start = is_equal ? 
1 : 0; + Kokkos::Impl::Experimental::ReduceDuplicates( + internal_view.data(), + dest.data(), + stride, + start, + extent, + internal_view.label()); + } + + void reset() { + Kokkos::Impl::Experimental::ResetDuplicates( + internal_view.data(), + internal_view.size(), + internal_view.label()); + } + template + void reset_except(View const& view) { + if (view.data() != internal_view.data()) { + reset(); + return; + } + Kokkos::Impl::Experimental::ResetDuplicates( + internal_view.data() + view.size(), + internal_view.size() - view.size(), + internal_view.label()); + } + + void resize(const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0) { + + size_t arg_N[8] = {n0,n1,n2,n3,n4,n5,n6,0}; + const int i = internal_view.rank-1; + arg_N[i] = unique_token.size(); + + ::Kokkos::resize(internal_view, + arg_N[0], arg_N[1], arg_N[2], arg_N[3], + arg_N[4], arg_N[5], arg_N[6], arg_N[7]); + } + + void realloc(const size_t n0 = 0, + const size_t n1 = 0, + const size_t n2 = 0, + const size_t n3 = 0, + const size_t n4 = 0, + const size_t n5 = 0, + const size_t n6 = 0) { + + size_t arg_N[8] = {n0,n1,n2,n3,n4,n5,n6,0}; + const int i = internal_view.rank-1; + arg_N[i] = unique_token.size(); + + ::Kokkos::realloc(internal_view, + arg_N[0], arg_N[1], arg_N[2], arg_N[3], + arg_N[4], arg_N[5], arg_N[6], arg_N[7]); + } + +protected: + template + inline original_reference_type at(int thread_id, Args ... 
args) const { + return internal_view(args..., thread_id); + } + +protected: + typedef Kokkos::Experimental::UniqueToken< + ExecSpace, Kokkos::Experimental::UniqueTokenScope::Global> unique_token_type; + + unique_token_type unique_token; + internal_view_type internal_view; +}; + + +/* This object has to be separate in order to store the thread ID, which cannot + be obtained until one is inside a parallel construct, and may be relatively + expensive to obtain at every contribution + (calls a non-inlined function, looks up a thread-local variable). + Due to the expense, it is sensible to query it at most once per parallel iterate + (ideally once per thread, but parallel_for doesn't expose that) + and then store it in a stack variable. + ScatterAccess serves as a non-const object on the stack which can store the thread ID */ + +template +class ScatterAccess +{ +public: + typedef ScatterView view_type; + typedef typename view_type::original_value_type original_value_type; + typedef Kokkos::Impl::Experimental::ScatterValue< + original_value_type, Op, override_contribution> value_type; + + inline ScatterAccess(view_type const& view_in) + : view(view_in) + , thread_id(view_in.unique_token.acquire()) { + } + + inline ~ScatterAccess() { + if (thread_id != ~thread_id_type(0)) view.unique_token.release(thread_id); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + value_type operator()(Args ... 
args) const { + return view.at(thread_id, args...); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + typename std::enable_if::value, value_type>::type + operator[](Arg arg) const { + return view.at(thread_id, arg); + } + +private: + + view_type const& view; + + // simplify RAII by disallowing copies + ScatterAccess(ScatterAccess const& other) = delete; + ScatterAccess& operator=(ScatterAccess const& other) = delete; + ScatterAccess& operator=(ScatterAccess&& other) = delete; + +public: + // do need to allow moves though, for the common + // auto b = a.access(); + // that assignments turns into a move constructor call + inline ScatterAccess(ScatterAccess&& other) + : view(other.view) + , thread_id(other.thread_id) + { + other.thread_id = ~thread_id_type(0); + } + +private: + + typedef typename view_type::unique_token_type unique_token_type; + typedef typename unique_token_type::size_type thread_id_type; + thread_id_type thread_id; +}; + +template +ScatterView + < RT + , typename ViewTraits::array_layout + , typename ViewTraits::execution_space + , Op + /* just setting defaults if not specified... things got messy because the view type + does not come before the duplication/contribution settings in the + template parameter list */ + , duplication == -1 ? Kokkos::Impl::Experimental::DefaultDuplication::execution_space>::value : duplication + , contribution == -1 ? + Kokkos::Impl::Experimental::DefaultContribution< + typename ViewTraits::execution_space, + (duplication == -1 ? 
+ Kokkos::Impl::Experimental::DefaultDuplication< + typename ViewTraits::execution_space + >::value + : duplication + ) + >::value + : contribution + > +create_scatter_view(View const& original_view) { + return original_view; // implicit ScatterView constructor call +} + +}} // namespace Kokkos::Experimental + +namespace Kokkos { +namespace Experimental { + +template +void +contribute(View& dest, Kokkos::Experimental::ScatterView const& src) +{ + src.contribute_into(dest); +} + +}} // namespace Kokkos::Experimental + +namespace Kokkos { + +template +void +realloc(Kokkos::Experimental::ScatterView& scatter_view, IS ... is) +{ + scatter_view.realloc(is ...); +} + +template +void +resize(Kokkos::Experimental::ScatterView& scatter_view, IS ... is) +{ + scatter_view.resize(is ...); +} + +} // namespace Kokkos + +#endif diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index 63520daa6b..059587a67c 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -517,7 +517,7 @@ public: size_type find_attempts = 0; - enum { bounded_find_attempts = 32u }; + enum : unsigned { bounded_find_attempts = 32u }; const size_type max_attempts = (m_bounded_insert && (bounded_find_attempts < m_available_indexes.max_hint()) ) ? 
bounded_find_attempts : m_available_indexes.max_hint(); diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp index 91fecd6151..03bbefab10 100644 --- a/lib/kokkos/containers/src/Kokkos_Vector.hpp +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -56,11 +56,12 @@ template< class Scalar, class Arg1Type = void> class vector : public DualView { +public: typedef Scalar value_type; typedef Scalar* pointer; typedef const Scalar* const_pointer; - typedef Scalar* reference; - typedef const Scalar* const_reference; + typedef Scalar& reference; + typedef const Scalar& const_reference; typedef Scalar* iterator; typedef const Scalar* const_iterator; @@ -73,11 +74,11 @@ private: public: #ifdef KOKKOS_ENABLE_CUDA_UVM - KOKKOS_INLINE_FUNCTION Scalar& operator() (int i) const {return DV::h_view(i);}; - KOKKOS_INLINE_FUNCTION Scalar& operator[] (int i) const {return DV::h_view(i);}; + KOKKOS_INLINE_FUNCTION reference operator() (int i) const {return DV::h_view(i);}; + KOKKOS_INLINE_FUNCTION reference operator[] (int i) const {return DV::h_view(i);}; #else - inline Scalar& operator() (int i) const {return DV::h_view(i);}; - inline Scalar& operator[] (int i) const {return DV::h_view(i);}; + inline reference operator() (int i) const {return DV::h_view(i);}; + inline reference operator[] (int i) const {return DV::h_view(i);}; #endif /* Member functions which behave like std::vector functions */ @@ -86,7 +87,7 @@ public: _size = 0; _extra_storage = 1.1; DV::modified_host() = 1; - }; + } vector(int n, Scalar val=Scalar()):DualView("Vector",size_t(n*(1.1))) { @@ -146,25 +147,32 @@ public: DV::h_view(_size) = val; _size++; - }; + } void pop_back() { _size--; - }; + } void clear() { _size = 0; } - size_type size() const {return _size;}; + size_type size() const {return _size;} size_type max_size() const {return 2000000000;} - size_type capacity() const {return DV::capacity();}; - bool empty() const {return _size==0;}; + size_type capacity() 
const {return DV::capacity();} + bool empty() const {return _size==0;} - iterator begin() const {return &DV::h_view(0);}; + iterator begin() const {return &DV::h_view(0);} - iterator end() const {return &DV::h_view(_size);}; + iterator end() const {return &DV::h_view(_size);} + reference front() {return DV::h_view(0);} + + reference back() {return DV::h_view(_size - 1);} + + const_reference front() const {return DV::h_view(0);} + + const_reference back() const {return DV::h_view(_size - 1);} /* std::algorithms wich work originally with iterators, here they are implemented as member functions */ diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index 0c59c616d6..1162d2a6ba 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -3,7 +3,13 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -SET(LIBRARIES kokkoscore) +IF(NOT KOKKOS_HAS_TRILINOS) + IF(KOKKOS_SEPARATE_LIBS) + set(TEST_LINK_TARGETS kokkoscore) + ELSE() + set(TEST_LINK_TARGETS kokkos) + ENDIF() +ENDIF() IF(Kokkos_ENABLE_Pthread) TRIBITS_ADD_EXECUTABLE_AND_TEST( @@ -12,7 +18,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() @@ -23,7 +29,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() @@ -34,7 +40,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() @@ -45,7 +51,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 
1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile index 52559935d4..2b6861f6d7 100644 --- a/lib/kokkos/containers/unit_tests/Makefile +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -15,7 +15,8 @@ endif CXXFLAGS = -O3 LINK ?= $(CXX) -LDFLAGS ?= -lpthread +LDFLAGS ?= +override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos @@ -30,6 +31,12 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) TEST_TARGETS += test-cuda endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_ROCM = TestROCm.o UnitTestMain.o gtest-all.o + TARGETS += KokkosContainers_UnitTest_ROCm + TEST_TARGETS += test-rocm +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_THREADS = TestThreads.o UnitTestMain.o gtest-all.o TARGETS += KokkosContainers_UnitTest_Threads @@ -51,6 +58,9 @@ endif KokkosContainers_UnitTest_Cuda: $(OBJ_CUDA) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_CUDA) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Cuda +KokkosContainers_UnitTest_ROCm: $(OBJ_ROCM) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(EXTRA_PATH) $(OBJ_ROCM) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_ROCm + KokkosContainers_UnitTest_Threads: $(OBJ_THREADS) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_THREADS) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosContainers_UnitTest_Threads @@ -63,6 +73,9 @@ KokkosContainers_UnitTest_Serial: $(OBJ_SERIAL) $(KOKKOS_LINK_DEPENDS) test-cuda: KokkosContainers_UnitTest_Cuda ./KokkosContainers_UnitTest_Cuda +test-rocm: KokkosContainers_UnitTest_ROCm + ./KokkosContainers_UnitTest_ROCm + test-threads: KokkosContainers_UnitTest_Threads ./KokkosContainers_UnitTest_Threads diff --git a/lib/kokkos/containers/unit_tests/TestCuda.cpp b/lib/kokkos/containers/unit_tests/TestCuda.cpp index 651a4e7eb8..ddd6bdae6d 100644 --- 
a/lib/kokkos/containers/unit_tests/TestCuda.cpp +++ b/lib/kokkos/containers/unit_tests/TestCuda.cpp @@ -62,6 +62,7 @@ #include #include #include +#include #include #include @@ -201,10 +202,18 @@ void cuda_test_bitset() cuda_test_dualview_combinations(size); \ } +#define CUDA_SCATTERVIEW_TEST( size ) \ + TEST_F( cuda, scatterview_##size##x) { \ + test_scatter_view(size); \ + } + CUDA_DUALVIEW_COMBINE_TEST( 10 ) CUDA_VECTOR_COMBINE_TEST( 10 ) CUDA_VECTOR_COMBINE_TEST( 3057 ) +CUDA_SCATTERVIEW_TEST( 10 ) + +CUDA_SCATTERVIEW_TEST( 1000000 ) CUDA_INSERT_TEST(close, 100000, 90000, 100, 500) CUDA_INSERT_TEST(far, 100000, 90000, 100, 500) diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index a7ae15921f..860c75b1ac 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -131,11 +131,14 @@ struct TestDynamicView // printf("TestDynamicView::run(%d) construct memory pool\n",arg_total_size); + const size_t total_alloc_size = arg_total_size * sizeof(Scalar) * 1.2 ; + const size_t superblock = std::min( total_alloc_size , size_t(1000000) ); + memory_pool_type pool( memory_space() - , arg_total_size * sizeof(Scalar) * 1.2 + , total_alloc_size , 500 /* min block size in bytes */ , 30000 /* max block size in bytes */ - , 1000000 /* min superblock size in bytes */ + , superblock ); // printf("TestDynamicView::run(%d) construct dynamic view\n",arg_total_size); diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp index 5365d91361..6b2223f418 100644 --- a/lib/kokkos/containers/unit_tests/TestOpenMP.cpp +++ b/lib/kokkos/containers/unit_tests/TestOpenMP.cpp @@ -63,6 +63,8 @@ #include #include +#include + #include #include @@ -152,6 +154,11 @@ TEST_F( openmp , staticcrsgraph ) test_dualview_combinations(size); \ } +#define OPENMP_SCATTERVIEW_TEST( size ) \ + TEST_F( openmp, 
scatterview_##size##x) { \ + test_scatter_view(size); \ + } + OPENMP_INSERT_TEST(close, 100000, 90000, 100, 500, true) OPENMP_INSERT_TEST(far, 100000, 90000, 100, 500, false) OPENMP_FAILED_INSERT_TEST( 10000, 1000 ) @@ -161,6 +168,10 @@ OPENMP_VECTOR_COMBINE_TEST( 10 ) OPENMP_VECTOR_COMBINE_TEST( 3057 ) OPENMP_DUALVIEW_COMBINE_TEST( 10 ) +OPENMP_SCATTERVIEW_TEST( 10 ) + +OPENMP_SCATTERVIEW_TEST( 1000000 ) + #undef OPENMP_INSERT_TEST #undef OPENMP_FAILED_INSERT_TEST #undef OPENMP_ASSIGNEMENT_TEST diff --git a/lib/kokkos/containers/unit_tests/TestROCm.cpp b/lib/kokkos/containers/unit_tests/TestROCm.cpp new file mode 100644 index 0000000000..b910e881e8 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestROCm.cpp @@ -0,0 +1,263 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#ifdef KOKKOS_ENABLE_ROCM + +#include +#include +#include + +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include + +//---------------------------------------------------------------------------- + + + +namespace Test { + +class rocm : public ::testing::Test { +protected: + static void SetUpTestCase() + { + std::cout << std::setprecision(5) << std::scientific; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + } + static void TearDownTestCase() + { + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +}; + +#if !defined(KOKKOS_ENABLE_ROCM) +//issue 964 +TEST_F( rocm , dyn_view_api) { + TestDynViewAPI< double , Kokkos::Experimental::ROCm >(); +} +#endif + +TEST_F( rocm, viewctorprop_embedded_dim ) { + TestViewCtorProp_EmbeddedDim< Kokkos::Experimental::ROCm >::test_vcpt( 2, 3 
); +} + +TEST_F( rocm , staticcrsgraph ) +{ + TestStaticCrsGraph::run_test_graph< Kokkos::Experimental::ROCm >(); + TestStaticCrsGraph::run_test_graph2< Kokkos::Experimental::ROCm >(); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 0); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 1000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 10000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(1, 100000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 0); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 1000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 10000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(3, 100000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 0); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 1000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 10000); + TestStaticCrsGraph::run_test_graph3< Kokkos::Experimental::ROCm >(75, 100000); +} + + +#if !defined(KOKKOS_ENABLE_ROCM) +// issue 1089 +// same as 130203 (MemPool, static member function link issue +void rocm_test_insert_close( uint32_t num_nodes + , uint32_t num_inserts + , uint32_t num_duplicates + ) +{ + test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, true); +} + +// hcc link error , Referencing function in another module! 
+void rocm_test_insert_far( uint32_t num_nodes + , uint32_t num_inserts + , uint32_t num_duplicates + ) +{ + test_insert< Kokkos::Experimental::ROCm >( num_nodes, num_inserts, num_duplicates, false); +} + +void rocm_test_failed_insert( uint32_t num_nodes ) +{ + test_failed_insert< Kokkos::Experimental::ROCm >( num_nodes ); +} + +void rocm_test_deep_copy( uint32_t num_nodes ) +{ + test_deep_copy< Kokkos::Experimental::ROCm >( num_nodes ); +} + +void rocm_test_vector_combinations(unsigned int size) +{ + test_vector_combinations(size); +} + +void rocm_test_dualview_combinations(unsigned int size) +{ + test_dualview_combinations(size); +} + +void rocm_test_bitset() +{ + test_bitset(); +} + + + +/*TEST_F( rocm, bitset ) +{ + rocm_test_bitset(); +}*/ + +#define ROCM_INSERT_TEST( name, num_nodes, num_inserts, num_duplicates, repeat ) \ + TEST_F( rocm, UnorderedMap_insert_##name##_##num_nodes##_##num_inserts##_##num_duplicates##_##repeat##x) { \ + for (int i=0; i + typedef TestDynamicView< double , Kokkos::Experimental::ROCmSpace > + TestDynView ; + + for ( int i = 0 ; i < 10 ; ++i ) { + TestDynView::run( 100000 + 100 * i ); + } +} +#endif + + +#if defined(KOKKOS_CLASS_LAMBDA) +TEST_F(rocm, ErrorReporterViaLambda) +{ + TestErrorReporter>(); +} +#endif + +TEST_F(rocm, ErrorReporter) +{ + TestErrorReporter>(); +} + +} + +#else +void KOKKOS_CONTAINERS_UNIT_TESTS_TESTROCM_PREVENT_EMPTY_LINK_ERROR() {} +#endif /* #ifdef KOKKOS_ENABLE_ROCM */ + diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp new file mode 100644 index 0000000000..42e6c09307 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -0,0 +1,156 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. 
Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_TEST_SCATTER_VIEW_HPP +#define KOKKOS_TEST_SCATTER_VIEW_HPP + +#include + +namespace Test { + +template +void test_scatter_view_config(int n) +{ + Kokkos::View original_view("original_view", n); + { + auto scatter_view = Kokkos::Experimental::create_scatter_view + < Kokkos::Experimental::ScatterSum + , duplication + , contribution + > (original_view); +#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA ) + auto policy = Kokkos::RangePolicy(0, n); + auto f = KOKKOS_LAMBDA(int i) { + auto scatter_access = scatter_view.access(); + auto scatter_access_atomic = scatter_view.template access(); + for (int j = 0; j < 10; ++j) { + auto k = (i + j) % n; + scatter_access(k, 0) += 4.2; + scatter_access_atomic(k, 1) += 2.0; + scatter_access(k, 2) += 1.0; + } + }; + Kokkos::parallel_for(policy, f, "scatter_view_test"); +#endif + Kokkos::Experimental::contribute(original_view, scatter_view); + scatter_view.reset_except(original_view); +#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA ) + Kokkos::parallel_for(policy, f, "scatter_view_test"); +#endif + Kokkos::Experimental::contribute(original_view, scatter_view); + } +#if defined( KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA ) + auto host_view = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), original_view); + for (typename decltype(host_view)::size_type i = 0; i < host_view.dimension_0(); ++i) { + auto val0 = host_view(i, 0); + auto val1 = host_view(i, 1); + auto val2 = host_view(i, 2); + EXPECT_TRUE(std::fabs((val0 - 84.0) / 84.0) < 1e-15); + EXPECT_TRUE(std::fabs((val1 - 40.0) / 40.0) < 1e-15); + EXPECT_TRUE(std::fabs((val2 - 20.0) / 20.0) < 1e-15); + } +#endif + { + Kokkos::Experimental::ScatterView + < double*[3] + , Layout + , ExecSpace + , Kokkos::Experimental::ScatterSum + , duplication + , contribution + > + persistent_view("persistent", n); + auto result_view = 
persistent_view.subview(); + contribute(result_view, persistent_view); + } +} + +template +struct TestDuplicatedScatterView { + TestDuplicatedScatterView(int n) { + test_scatter_view_config(n); + test_scatter_view_config(n); + } +}; + +#ifdef KOKKOS_ENABLE_CUDA +// disable duplicated instantiation with CUDA until +// UniqueToken can support it +template <> +struct TestDuplicatedScatterView { + TestDuplicatedScatterView(int) { + } +}; +#endif + +template +void test_scatter_view(int n) +{ + // all of these configurations should compile okay, but only some of them are + // correct and/or sensible in terms of memory use + Kokkos::Experimental::UniqueToken unique_token{ExecSpace()}; + + // no atomics or duplication is only sensible if the execution space + // is running essentially in serial (doesn't have to be Serial though, + // we also test OpenMP with one thread: LAMMPS cares about that) + if (unique_token.size() == 1) { + test_scatter_view_config(n); + } + test_scatter_view_config(n); + + TestDuplicatedScatterView duptest(n); +} + +} // namespace Test + +#endif //KOKKOS_TEST_UNORDERED_MAP_HPP + + diff --git a/lib/kokkos/containers/unit_tests/TestSerial.cpp b/lib/kokkos/containers/unit_tests/TestSerial.cpp index 1b9b5a2da3..c9b7392d48 100644 --- a/lib/kokkos/containers/unit_tests/TestSerial.cpp +++ b/lib/kokkos/containers/unit_tests/TestSerial.cpp @@ -58,6 +58,7 @@ #include #include #include +#include #include @@ -148,6 +149,11 @@ TEST_F( serial, bitset ) test_dualview_combinations(size); \ } +#define SERIAL_SCATTERVIEW_TEST( size ) \ + TEST_F( serial, scatterview_##size##x) { \ + test_scatter_view(size); \ + } + SERIAL_INSERT_TEST(close, 100000, 90000, 100, 500, true) SERIAL_INSERT_TEST(far, 100000, 90000, 100, 500, false) SERIAL_FAILED_INSERT_TEST( 10000, 1000 ) @@ -157,6 +163,10 @@ SERIAL_VECTOR_COMBINE_TEST( 10 ) SERIAL_VECTOR_COMBINE_TEST( 3057 ) SERIAL_DUALVIEW_COMBINE_TEST( 10 ) +SERIAL_SCATTERVIEW_TEST( 10 ) + +SERIAL_SCATTERVIEW_TEST( 1000000 ) + #undef 
SERIAL_INSERT_TEST #undef SERIAL_FAILED_INSERT_TEST #undef SERIAL_ASSIGNEMENT_TEST diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp index cccb304ec0..aee6ef79b6 100644 --- a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp +++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -71,7 +71,7 @@ void run_test_graph() } dx = Kokkos::create_staticcrsgraph( "dx" , graph ); - hx = Kokkos::create_mirror( dx ); + hx = Kokkos::create_mirror( dx ); ASSERT_EQ( hx.row_map.dimension_0() - 1 , LENGTH ); @@ -83,6 +83,16 @@ void run_test_graph() ASSERT_EQ( (int) hx.entries( j + begin ) , graph[i][j] ); } } + + // Test row view access + for ( size_t i = 0 ; i < LENGTH ; ++i ) { + auto rowView = hx.rowConst(i); + ASSERT_EQ( rowView.length, graph[i].size() ); + for ( size_t j = 0 ; j < rowView.length ; ++j ) { + ASSERT_EQ( rowView.colidx( j ) , graph[i][j] ); + ASSERT_EQ( rowView( j ) , graph[i][j] ); + } + } } template< class Space > @@ -182,5 +192,6 @@ void run_test_graph3(size_t B, size_t N) ASSERT_FALSE((ne>2*((hx.row_map(hx.numRows())+C*hx.numRows())/B))&&(hx.row_block_offsets(i+1)>hx.row_block_offsets(i)+1)); } } + } /* namespace TestStaticCrsGraph */ diff --git a/lib/kokkos/core/CMakeLists.txt b/lib/kokkos/core/CMakeLists.txt index 42fce6b2f2..93db0d2ecf 100644 --- a/lib/kokkos/core/CMakeLists.txt +++ b/lib/kokkos/core/CMakeLists.txt @@ -2,7 +2,9 @@ TRIBITS_SUBPACKAGE(Core) -ADD_SUBDIRECTORY(src) +IF(KOKKOS_HAS_TRILINOS) + ADD_SUBDIRECTORY(src) +ENDIF() TRIBITS_ADD_TEST_DIRECTORIES(unit_test) TRIBITS_ADD_TEST_DIRECTORIES(perf_test) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index 9f19a2a73e..84c49a7713 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -2,6 +2,14 @@ INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING 
${CMAKE_CURRENT_SOURCE_DIR}) +IF(NOT KOKKOS_HAS_TRILINOS) + IF(KOKKOS_SEPARATE_LIBS) + set(TEST_LINK_TARGETS kokkoscore) + ELSE() + set(TEST_LINK_TARGETS kokkos) + ENDIF() +ENDIF() + # warning: PerfTest_CustomReduction.cpp uses # ../../algorithms/src/Kokkos_Random.hpp # we'll just allow it to be included, but note @@ -23,7 +31,7 @@ TRIBITS_ADD_EXECUTABLE( PerfTestExec SOURCES ${SOURCES} COMM serial mpi - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) TRIBITS_ADD_TEST( diff --git a/lib/kokkos/core/perf_test/Makefile b/lib/kokkos/core/perf_test/Makefile index bb9353f583..cdb8e03c1e 100644 --- a/lib/kokkos/core/perf_test/Makefile +++ b/lib/kokkos/core/perf_test/Makefile @@ -17,7 +17,8 @@ endif CXXFLAGS = -O3 #CXXFLAGS += -DGENERIC_REDUCER LINK ?= $(CXX) -LDFLAGS ?= -lpthread +LDFLAGS ?= +override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos @@ -43,6 +44,7 @@ TEST_TARGETS += test-atomic # +ifneq ($(KOKKOS_INTERNAL_USE_ROCM), 1) OBJ_MEMPOOL = test_mempool.o TARGETS += KokkosCore_PerformanceTest_Mempool TEST_TARGETS += test-mempool @@ -52,6 +54,7 @@ TEST_TARGETS += test-mempool OBJ_TASKDAG = test_taskdag.o TARGETS += KokkosCore_PerformanceTest_TaskDAG TEST_TARGETS += test-taskdag +endif # diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index 0d5d97a829..1914b6ba96 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -1,15 +1,4 @@ -TRIBITS_ADD_OPTION_AND_DEFINE( - Kokkos_ENABLE_Serial - KOKKOS_HAVE_SERIAL - "Whether to enable the Kokkos::Serial device. This device executes \"parallel\" kernels sequentially on a single CPU thread. It is enabled by default. If you disable this device, please enable at least one other CPU device, such as Kokkos::OpenMP or Kokkos::Threads." 
- ON - ) - -ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11) -ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA) - -TRIBITS_CONFIGURE_FILE(${PACKAGE_NAME}_config.h) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) @@ -20,68 +9,90 @@ SET(TRILINOS_INCDIR ${CMAKE_INSTALL_PREFIX}/${${PROJECT_NAME}_INSTALL_INCLUDE_DI #----------------------------------------------------------------------------- -SET(HEADERS_PUBLIC "") -SET(HEADERS_PRIVATE "") -SET(SOURCES "") +IF(KOKKOS_LEGACY_TRIBITS) -FILE(GLOB HEADERS_PUBLIC Kokkos*.hpp) -LIST( APPEND HEADERS_PUBLIC ${CMAKE_CURRENT_BINARY_DIR}/${PACKAGE_NAME}_config.h ) + ASSERT_DEFINED(${PROJECT_NAME}_ENABLE_CXX11) + ASSERT_DEFINED(${PACKAGE_NAME}_ENABLE_CUDA) + + SET(HEADERS_PUBLIC "") + SET(HEADERS_PRIVATE "") + SET(SOURCES "") + + FILE(GLOB HEADERS_PUBLIC Kokkos*.hpp) + LIST( APPEND HEADERS_PUBLIC ${CMAKE_BINARY_DIR}/${PACKAGE_NAME}_config.h ) + + #----------------------------------------------------------------------------- + + FILE(GLOB HEADERS_IMPL impl/*.hpp) + FILE(GLOB SOURCES_IMPL impl/*.cpp) + + LIST(APPEND HEADERS_PRIVATE ${HEADERS_IMPL} ) + LIST(APPEND SOURCES ${SOURCES_IMPL} ) + + INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/) + + #----------------------------------------------------------------------------- + + FILE(GLOB HEADERS_THREADS Threads/*.hpp) + FILE(GLOB SOURCES_THREADS Threads/*.cpp) + + LIST(APPEND HEADERS_PRIVATE ${HEADERS_THREADS} ) + LIST(APPEND SOURCES ${SOURCES_THREADS} ) + + INSTALL(FILES ${HEADERS_THREADS} DESTINATION ${TRILINOS_INCDIR}/Threads/) + + #----------------------------------------------------------------------------- + + FILE(GLOB HEADERS_OPENMP OpenMP/*.hpp) + FILE(GLOB SOURCES_OPENMP OpenMP/*.cpp) + + LIST(APPEND HEADERS_PRIVATE ${HEADERS_OPENMP} ) + LIST(APPEND SOURCES ${SOURCES_OPENMP} ) + + INSTALL(FILES ${HEADERS_OPENMP} DESTINATION ${TRILINOS_INCDIR}/OpenMP/) + + 
#----------------------------------------------------------------------------- + + FILE(GLOB HEADERS_CUDA Cuda/*.hpp) + FILE(GLOB SOURCES_CUDA Cuda/*.cpp) + + LIST(APPEND HEADERS_PRIVATE ${HEADERS_CUDA} ) + LIST(APPEND SOURCES ${SOURCES_CUDA} ) + + INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/) + + #----------------------------------------------------------------------------- + FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp) + FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp) + + LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} ) + LIST(APPEND SOURCES ${SOURCES_QTHREADS} ) + + INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/) + + TRIBITS_ADD_LIBRARY( + kokkoscore + HEADERS ${HEADERS_PUBLIC} + NOINSTALLHEADERS ${HEADERS_PRIVATE} + SOURCES ${SOURCES} + DEPLIBS + ) #----------------------------------------------------------------------------- +# In the new build system, sources are calculated by Makefile.kokkos +else() -FILE(GLOB HEADERS_IMPL impl/*.hpp) -FILE(GLOB SOURCES_IMPL impl/*.cpp) + INSTALL (DIRECTORY + "${CMAKE_CURRENT_SOURCE_DIR}/" + DESTINATION ${TRILINOS_INCDIR} + FILES_MATCHING PATTERN "*.hpp" + ) -LIST(APPEND HEADERS_PRIVATE ${HEADERS_IMPL} ) -LIST(APPEND SOURCES ${SOURCES_IMPL} ) - -INSTALL(FILES ${HEADERS_IMPL} DESTINATION ${TRILINOS_INCDIR}/impl/) + TRIBITS_ADD_LIBRARY( + kokkoscore + SOURCES ${KOKKOS_CORE_SRCS} + DEPLIBS + ) +endif() #----------------------------------------------------------------------------- - -FILE(GLOB HEADERS_THREADS Threads/*.hpp) -FILE(GLOB SOURCES_THREADS Threads/*.cpp) - -LIST(APPEND HEADERS_PRIVATE ${HEADERS_THREADS} ) -LIST(APPEND SOURCES ${SOURCES_THREADS} ) - -INSTALL(FILES ${HEADERS_THREADS} DESTINATION ${TRILINOS_INCDIR}/Threads/) - -#----------------------------------------------------------------------------- - -FILE(GLOB HEADERS_OPENMP OpenMP/*.hpp) -FILE(GLOB SOURCES_OPENMP OpenMP/*.cpp) - -LIST(APPEND HEADERS_PRIVATE ${HEADERS_OPENMP} ) -LIST(APPEND SOURCES ${SOURCES_OPENMP} ) - 
-INSTALL(FILES ${HEADERS_OPENMP} DESTINATION ${TRILINOS_INCDIR}/OpenMP/) - -#----------------------------------------------------------------------------- - -FILE(GLOB HEADERS_CUDA Cuda/*.hpp) -FILE(GLOB SOURCES_CUDA Cuda/*.cpp) - -LIST(APPEND HEADERS_PRIVATE ${HEADERS_CUDA} ) -LIST(APPEND SOURCES ${SOURCES_CUDA} ) - -INSTALL(FILES ${HEADERS_CUDA} DESTINATION ${TRILINOS_INCDIR}/Cuda/) - -#----------------------------------------------------------------------------- -FILE(GLOB HEADERS_QTHREADS Qthreads/*.hpp) -FILE(GLOB SOURCES_QTHREADS Qthreads/*.cpp) - -LIST(APPEND HEADERS_PRIVATE ${HEADERS_QTHREADS} ) -LIST(APPEND SOURCES ${SOURCES_QTHREADS} ) - -INSTALL(FILES ${HEADERS_QTHREADS} DESTINATION ${TRILINOS_INCDIR}/Qthreads/) - -#----------------------------------------------------------------------------- - -TRIBITS_ADD_LIBRARY( - kokkoscore - HEADERS ${HEADERS_PUBLIC} - NOINSTALLHEADERS ${HEADERS_PRIVATE} - SOURCES ${SOURCES} - DEPLIBS - ) diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp index 341404b9c3..1a5626b04e 100644 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile.hpp @@ -63,7 +63,7 @@ #include #endif -namespace Kokkos { namespace Experimental { namespace Impl { +namespace Kokkos { namespace Impl { // ------------------------------------------------------------------ // @@ -110,21 +110,12 @@ struct apply_impl<2,RP,Functor,void > { // LL if (RP::inner_direction == RP::Left) { - /* - index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y; - index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x; - - for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) { - for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) { - m_func(i, j); - } } -*/ for ( index_type tile_id1 = blockIdx.y; 
tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1); } @@ -134,21 +125,12 @@ struct apply_impl<2,RP,Functor,void > } // LR else { -/* - index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y; - index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x; - - for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) { - for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) { - m_func(i, j); - } } -*/ for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { m_func(offset_0 , offset_1); } @@ -182,21 +164,12 @@ struct apply_impl<2,RP,Functor,Tag> { if (RP::inner_direction == RP::Left) { // Loop over size 
maxnumblocks until full range covered -/* - index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y; - index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x; - - for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) { - for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) { - m_func(Tag(), i, j); - } } -*/ for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1); } @@ -205,21 +178,12 @@ struct apply_impl<2,RP,Functor,Tag> } } else { -/* - index_type offset_1 = blockIdx.y*m_rp.m_tile[1] + threadIdx.y; - index_type offset_0 = blockIdx.x*m_rp.m_tile[0] + threadIdx.x; - - for ( index_type i = offset_0; i < m_rp.m_upper[0], threadIdx.x < m_rp.m_tile[0]; i += (gridDim.x*m_rp.m_tile[0]) ) { - for ( index_type j = offset_1; j < m_rp.m_upper[1], threadIdx.y < m_rp.m_tile[1]; j += (gridDim.y*m_rp.m_tile[1]) ) { - m_func(Tag(), i, j); - } } -*/ for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { 
for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { m_func(Tag(), offset_0 , offset_1); } @@ -255,15 +219,15 @@ struct apply_impl<3,RP,Functor,void > // LL if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -276,15 +240,15 @@ struct apply_impl<3,RP,Functor,void > // LR else { for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { for ( index_type 
tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) { m_func(offset_0 , offset_1 , offset_2); } @@ -319,15 +283,15 @@ struct apply_impl<3,RP,Functor,Tag> { if (RP::inner_direction == RP::Left) { for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) { for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id0 = blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -339,15 +303,15 @@ struct apply_impl<3,RP,Functor,Tag> } else { for ( index_type tile_id0 = 
blockIdx.x; tile_id0 < m_rp.m_tile_end[0]; tile_id0 += gridDim.x ) { - const index_type offset_0 = tile_id0*m_rp.m_tile[0] + threadIdx.x; + const index_type offset_0 = tile_id0*m_rp.m_tile[0] + (index_type)threadIdx.x + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && threadIdx.x < m_rp.m_tile[0] ) { for ( index_type tile_id1 = blockIdx.y; tile_id1 < m_rp.m_tile_end[1]; tile_id1 += gridDim.y ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + threadIdx.y; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && threadIdx.y < m_rp.m_tile[1] ) { for ( index_type tile_id2 = blockIdx.z; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.z ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.z; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.z < m_rp.m_tile[2] ) { m_func(Tag(), offset_0 , offset_1 , offset_2); } @@ -398,19 +362,19 @@ struct apply_impl<4,RP,Functor,void > const index_type thr_id1 = threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + 
const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -436,19 +400,19 @@ struct apply_impl<4,RP,Functor,void > const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3); } @@ -498,19 +462,19 @@ struct apply_impl<4,RP,Functor,Tag> const index_type thr_id1 = 
threadIdx.x / m_rp.m_tile[0]; for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) { for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag(), offset_0 , offset_1 , offset_2 , offset_3); } @@ -535,19 +499,19 @@ struct apply_impl<4,RP,Functor,Tag> const index_type thr_id1 = threadIdx.x % m_rp.m_tile[1]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = tile_id1*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < 
m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type tile_id2 = blockIdx.y; tile_id2 < m_rp.m_tile_end[2]; tile_id2 += gridDim.y ) { - const index_type offset_2 = tile_id2*m_rp.m_tile[2] + threadIdx.y; + const index_type offset_2 = tile_id2*m_rp.m_tile[2] + (index_type)threadIdx.y + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && threadIdx.y < m_rp.m_tile[2] ) { for ( index_type tile_id3 = blockIdx.z; tile_id3 < m_rp.m_tile_end[3]; tile_id3 += gridDim.z ) { - const index_type offset_3 = tile_id3*m_rp.m_tile[3] + threadIdx.z; + const index_type offset_3 = tile_id3*m_rp.m_tile[3] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && threadIdx.z < m_rp.m_tile[3] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3); } @@ -612,23 +576,23 @@ struct apply_impl<5,RP,Functor,void > const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] 
+ thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -667,23 +631,23 @@ struct apply_impl<5,RP,Functor,void > const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] 
&& threadIdx.z < m_rp.m_tile[4] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -747,23 +711,23 @@ struct apply_impl<5,RP,Functor,Tag> const index_type thr_id3 = threadIdx.y / m_rp.m_tile[2]; for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4); } @@ -802,23 +766,23 @@ struct apply_impl<5,RP,Functor,Tag> const index_type thr_id3 = threadIdx.y % m_rp.m_tile[3]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + 
thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type tile_id4 = blockIdx.z; tile_id4 < m_rp.m_tile_end[4]; tile_id4 += gridDim.z ) { - const index_type offset_4 = tile_id4*m_rp.m_tile[4] + threadIdx.z; + const index_type offset_4 = tile_id4*m_rp.m_tile[4] + (index_type)threadIdx.z + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && threadIdx.z < m_rp.m_tile[4] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4); } @@ -895,27 +859,27 @@ struct apply_impl<6,RP,Functor,void > const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + 
thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -967,27 +931,27 @@ struct apply_impl<6,RP,Functor,void > const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k 
= tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } @@ -1064,27 +1028,27 @@ struct apply_impl<6,RP,Functor,Tag> const index_type thr_id5 = threadIdx.z / m_rp.m_tile[4]; for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const 
index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] && thr_id2 < m_rp.m_tile[2] ) { for ( index_type j = tile_id1 ; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type i = tile_id0 ; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3, offset_4, offset_5); } @@ -1136,27 +1100,27 @@ struct apply_impl<6,RP,Functor,Tag> const index_type thr_id5 = threadIdx.z % m_rp.m_tile[5]; for ( index_type i = tile_id0; i < m_rp.m_tile_end[0]; i += numbl0 ) { - const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0; + const index_type offset_0 = i*m_rp.m_tile[0] + thr_id0 + (index_type)m_rp.m_lower[0]; if ( offset_0 < m_rp.m_upper[0] && thr_id0 < m_rp.m_tile[0] ) { for ( index_type j = tile_id1; j < m_rp.m_tile_end[1]; j += numbl1 ) { - const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1; + const index_type offset_1 = j*m_rp.m_tile[1] + thr_id1 + (index_type)m_rp.m_lower[1]; if ( offset_1 < m_rp.m_upper[1] && thr_id1 < m_rp.m_tile[1] ) { for ( index_type k = tile_id2; k < m_rp.m_tile_end[2]; k += numbl2 ) { - const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2; + const index_type offset_2 = k*m_rp.m_tile[2] + thr_id2 + (index_type)m_rp.m_lower[2]; if ( offset_2 < m_rp.m_upper[2] 
&& thr_id2 < m_rp.m_tile[2] ) { for ( index_type l = tile_id3; l < m_rp.m_tile_end[3]; l += numbl3 ) { - const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3; + const index_type offset_3 = l*m_rp.m_tile[3] + thr_id3 + (index_type)m_rp.m_lower[3]; if ( offset_3 < m_rp.m_upper[3] && thr_id3 < m_rp.m_tile[3] ) { for ( index_type m = tile_id4; m < m_rp.m_tile_end[4]; m += numbl4 ) { - const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4; + const index_type offset_4 = m*m_rp.m_tile[4] + thr_id4 + (index_type)m_rp.m_lower[4]; if ( offset_4 < m_rp.m_upper[4] && thr_id4 < m_rp.m_tile[4] ) { for ( index_type n = tile_id5; n < m_rp.m_tile_end[5]; n += numbl5 ) { - const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5; + const index_type offset_5 = n*m_rp.m_tile[5] + thr_id5 + (index_type)m_rp.m_lower[5]; if ( offset_5 < m_rp.m_upper[5] && thr_id5 < m_rp.m_tile[5] ) { m_func(Tag() , offset_0 , offset_1 , offset_2 , offset_3 , offset_4 , offset_5); } @@ -1292,7 +1256,7 @@ protected: const Functor m_func; }; -} } } //end namespace Kokkos::Experimental::Impl +} } //end namespace Kokkos::Impl #endif #endif diff --git a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp index c184c14d07..d59c5c6726 100644 --- a/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp +++ b/lib/kokkos/core/src/Cuda/KokkosExp_Cuda_IterateTile_Refactor.hpp @@ -63,7 +63,7 @@ #include #endif -namespace Kokkos { namespace Experimental { namespace Impl { +namespace Kokkos { namespace Impl { namespace Refactor { @@ -2709,7 +2709,7 @@ private: // ---------------------------------------------------------------------------------- -} } } //end namespace Kokkos::Experimental::Impl +} } //end namespace Kokkos::Impl #endif #endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp index 079d9f0889..f55191e98c 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp 
+++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaExec.hpp @@ -164,7 +164,7 @@ static void cuda_parallel_launch_constant_memory() template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -//__launch_bounds__(maxTperB, minBperSM) +__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_constant_memory() { const DriverType & driver = @@ -182,7 +182,7 @@ static void cuda_parallel_launch_local_memory( const DriverType driver ) template< class DriverType, unsigned int maxTperB, unsigned int minBperSM > __global__ -//__launch_bounds__(maxTperB, minBperSM) +__launch_bounds__(maxTperB, minBperSM) static void cuda_parallel_launch_local_memory( const DriverType driver ) { driver(); @@ -193,9 +193,14 @@ template < class DriverType , bool Large = ( CudaTraits::ConstantMemoryUseThreshold < sizeof(DriverType) ) > struct CudaParallelLaunch ; -template < class DriverType, class LaunchBounds > -struct CudaParallelLaunch< DriverType, LaunchBounds, true > { - +template < class DriverType + , unsigned int MaxThreadsPerBlock + , unsigned int MinBlocksPerSM > +struct CudaParallelLaunch< DriverType + , Kokkos::LaunchBounds< MaxThreadsPerBlock + , MinBlocksPerSM > + , true > +{ inline CudaParallelLaunch( const DriverType & driver , const dim3 & grid @@ -216,21 +221,28 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > { if ( CudaTraits::SharedMemoryCapacity < shmem ) { Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); } - #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads - else if ( shmem ) { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) ); - } else { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , 
cudaFuncCachePreferL1 ) ); + #ifndef KOKKOS_ARCH_KEPLER + // On Kepler the L1 has no benefit since it doesn't cache reads + else { + CUDA_SAFE_CALL( + cudaFuncSetCacheConfig + ( cuda_parallel_launch_constant_memory + < DriverType, MaxThreadsPerBlock, MinBlocksPerSM > + , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 ) + ) ); } #endif // Copy functor to constant memory on the device - cudaMemcpyToSymbol( kokkos_impl_cuda_constant_memory_buffer , & driver , sizeof(DriverType) ); + cudaMemcpyToSymbol( + kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) ); KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); // Invoke the driver function on the device - cuda_parallel_launch_constant_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>(); + cuda_parallel_launch_constant_memory + < DriverType, MaxThreadsPerBlock, MinBlocksPerSM > + <<< grid , block , shmem , stream >>>(); #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) CUDA_SAFE_CALL( cudaGetLastError() ); @@ -240,9 +252,11 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, true > { } }; -template < class DriverType, class LaunchBounds > -struct CudaParallelLaunch< DriverType, LaunchBounds, false > { - +template < class DriverType > +struct CudaParallelLaunch< DriverType + , Kokkos::LaunchBounds<> + , true > +{ inline CudaParallelLaunch( const DriverType & driver , const dim3 & grid @@ -252,20 +266,136 @@ struct CudaParallelLaunch< DriverType, LaunchBounds, false > { { if ( grid.x && ( block.x * block.y * block.z ) ) { + if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) < + sizeof( DriverType ) ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") ); + } + + // Fence before changing settings and copying closure + Kokkos::Cuda::fence(); + if ( CudaTraits::SharedMemoryCapacity < shmem ) { Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared 
memory request is too large") ); } - #ifndef KOKKOS_ARCH_KEPLER //On Kepler the L1 has no benefit since it doesn't cache reads - else if ( shmem ) { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferShared ) ); - } else { - CUDA_SAFE_CALL( cudaFuncSetCacheConfig( cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM > , cudaFuncCachePreferL1 ) ); + #ifndef KOKKOS_ARCH_KEPLER + // On Kepler the L1 has no benefit since it doesn't cache reads + else { + CUDA_SAFE_CALL( + cudaFuncSetCacheConfig + ( cuda_parallel_launch_constant_memory< DriverType > + , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 ) + ) ); + } + #endif + + // Copy functor to constant memory on the device + cudaMemcpyToSymbol( + kokkos_impl_cuda_constant_memory_buffer, &driver, sizeof(DriverType) ); + + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + + // Invoke the driver function on the device + cuda_parallel_launch_constant_memory< DriverType > + <<< grid , block , shmem , stream >>>(); + +#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) + CUDA_SAFE_CALL( cudaGetLastError() ); + Kokkos::Cuda::fence(); +#endif + } + } +}; + +template < class DriverType + , unsigned int MaxThreadsPerBlock + , unsigned int MinBlocksPerSM > +struct CudaParallelLaunch< DriverType + , Kokkos::LaunchBounds< MaxThreadsPerBlock + , MinBlocksPerSM > + , false > +{ + inline + CudaParallelLaunch( const DriverType & driver + , const dim3 & grid + , const dim3 & block + , const int shmem + , const cudaStream_t stream = 0 ) + { + if ( grid.x && ( block.x * block.y * block.z ) ) { + + if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) < + sizeof( DriverType ) ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") ); + } + + if ( CudaTraits::SharedMemoryCapacity < shmem ) { + 
Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); + } + #ifndef KOKKOS_ARCH_KEPLER + // On Kepler the L1 has no benefit since it doesn't cache reads + else { + CUDA_SAFE_CALL( + cudaFuncSetCacheConfig + ( cuda_parallel_launch_local_memory + < DriverType, MaxThreadsPerBlock, MinBlocksPerSM > + , ( shmem ? cudaFuncCachePreferShared : cudaFuncCachePreferL1 ) + ) ); } #endif KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); - cuda_parallel_launch_local_memory< DriverType, LaunchBounds::maxTperB, LaunchBounds::minBperSM ><<< grid , block , shmem , stream >>>( driver ); + // Invoke the driver function on the device + cuda_parallel_launch_local_memory + < DriverType, MaxThreadsPerBlock, MinBlocksPerSM > + <<< grid , block , shmem , stream >>>( driver ); + +#if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) + CUDA_SAFE_CALL( cudaGetLastError() ); + Kokkos::Cuda::fence(); +#endif + } + } +}; + +template < class DriverType > +struct CudaParallelLaunch< DriverType + , Kokkos::LaunchBounds<> + , false > +{ + inline + CudaParallelLaunch( const DriverType & driver + , const dim3 & grid + , const dim3 & block + , const int shmem + , const cudaStream_t stream = 0 ) + { + if ( grid.x && ( block.x * block.y * block.z ) ) { + + if ( sizeof( Kokkos::Impl::CudaTraits::ConstantGlobalBufferType ) < + sizeof( DriverType ) ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: Functor is too large") ); + } + + if ( CudaTraits::SharedMemoryCapacity < shmem ) { + Kokkos::Impl::throw_runtime_exception( std::string("CudaParallelLaunch FAILED: shared memory request is too large") ); + } + #ifndef KOKKOS_ARCH_KEPLER + // On Kepler the L1 has no benefit since it doesn't cache reads + else { + CUDA_SAFE_CALL( + cudaFuncSetCacheConfig + ( cuda_parallel_launch_local_memory< DriverType > + , ( shmem ? 
cudaFuncCachePreferShared : cudaFuncCachePreferL1 ) + ) ); + } + #endif + + KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); + + // Invoke the driver function on the device + cuda_parallel_launch_local_memory< DriverType > + <<< grid , block , shmem , stream >>>( driver ); #if defined( KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK ) CUDA_SAFE_CALL( cudaGetLastError() ); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index b699f0d6ba..33f77ea835 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -366,7 +366,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >:: if(Kokkos::Profiling::profileLibraryLoaded()) { SharedAllocationHeader header ; - Kokkos::Impl::DeepCopy::DeepCopy( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) ); + Kokkos::Impl::DeepCopy( & header , RecordBase::m_alloc_ptr , sizeof(SharedAllocationHeader) ); Kokkos::Profiling::deallocateData( Kokkos::Profiling::SpaceHandle(Kokkos::CudaSpace::name()),header.m_label, @@ -446,7 +446,7 @@ SharedAllocationRecord( const Kokkos::CudaSpace & arg_space ); // Copy to device memory - Kokkos::Impl::DeepCopy::DeepCopy( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) ); + Kokkos::Impl::DeepCopy( RecordBase::m_alloc_ptr , & header , sizeof(SharedAllocationHeader) ); } SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: @@ -655,7 +655,7 @@ SharedAllocationRecord< Kokkos::CudaSpace , void >::get_record( void * alloc_ptr Header const * const head_cuda = alloc_ptr ? Header::get_header( alloc_ptr ) : (Header*) 0 ; if ( alloc_ptr ) { - Kokkos::Impl::DeepCopy::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) ); + Kokkos::Impl::DeepCopy( & head , head_cuda , sizeof(SharedAllocationHeader) ); } RecordCuda * const record = alloc_ptr ? 
static_cast< RecordCuda * >( head.m_record ) : (RecordCuda *) 0 ; @@ -713,7 +713,7 @@ SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >::get_record( void * // Iterate records to print orphaned memory ... void SharedAllocationRecord< Kokkos::CudaSpace , void >:: -print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail ) +print_records( std::ostream & s , const Kokkos::CudaSpace & , bool detail ) { SharedAllocationRecord< void , void > * r = & s_root_record ; @@ -724,7 +724,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail if ( detail ) { do { if ( r->m_alloc_ptr ) { - Kokkos::Impl::DeepCopy::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) ); + Kokkos::Impl::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) ); } else { head.m_label[0] = 0 ; @@ -751,7 +751,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail , reinterpret_cast( r->m_dealloc ) , head.m_label ); - std::cout << buffer ; + s << buffer ; r = r->m_next ; } while ( r != & s_root_record ); } @@ -759,7 +759,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail do { if ( r->m_alloc_ptr ) { - Kokkos::Impl::DeepCopy::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) ); + Kokkos::Impl::DeepCopy( & head , r->m_alloc_ptr , sizeof(SharedAllocationHeader) ); //Formatting dependent on sizeof(uintptr_t) const char * format_string; @@ -781,7 +781,7 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail else { snprintf( buffer , 256 , "Cuda [ 0 + 0 ]\n" ); } - std::cout << buffer ; + s << buffer ; r = r->m_next ; } while ( r != & s_root_record ); } @@ -789,14 +789,14 @@ print_records( std::ostream & s , const Kokkos::CudaSpace & space , bool detail void SharedAllocationRecord< Kokkos::CudaUVMSpace , void >:: -print_records( std::ostream & s , const Kokkos::CudaUVMSpace & space , bool detail ) +print_records( 
std::ostream & s , const Kokkos::CudaUVMSpace & , bool detail ) { SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaUVM" , & s_root_record , detail ); } void SharedAllocationRecord< Kokkos::CudaHostPinnedSpace , void >:: -print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & space , bool detail ) +print_records( std::ostream & s , const Kokkos::CudaHostPinnedSpace & , bool detail ) { SharedAllocationRecord< void , void >::print_host_accessible_records( s , "CudaHostPinned" , & s_root_record , detail ); } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp index 80e8f9bd8a..a63fb0cda4 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Impl.cpp @@ -421,7 +421,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) std::string msg = ss.str(); Kokkos::abort( msg.c_str() ); } - if ( compiled_major != cudaProp.major || compiled_minor != cudaProp.minor ) { + if ( Kokkos::show_warnings() && (compiled_major != cudaProp.major || compiled_minor != cudaProp.minor) ) { std::cerr << "Kokkos::Cuda::initialize WARNING: running kernels compiled for compute capability " << compiled_major << "." << compiled_minor << " on device with compute capability " @@ -467,7 +467,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) m_scratchUnifiedSupported = cudaProp.unifiedAddressing ; - if ( ! m_scratchUnifiedSupported ) { + if ( Kokkos::show_warnings() && ! m_scratchUnifiedSupported ) { std::cout << "Kokkos::Cuda device " << cudaProp.name << " capability " << cudaProp.major << "." 
<< cudaProp.minor @@ -545,7 +545,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) } #ifdef KOKKOS_ENABLE_CUDA_UVM - if(!cuda_launch_blocking()) { + if( Kokkos::show_warnings() && !cuda_launch_blocking() ) { std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl; std::cout << " without setting CUDA_LAUNCH_BLOCKING=1." << std::endl; std::cout << " The code must call Cuda::fence() after each kernel" << std::endl; @@ -561,7 +561,7 @@ void CudaInternal::initialize( int cuda_device_id , int stream_count ) bool visible_devices_one=true; if (env_visible_devices == 0) visible_devices_one=false; - if(!visible_devices_one && !force_device_alloc) { + if( Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc) ) { std::cout << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default" << std::endl; std::cout << " without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " << std::endl; std::cout << " setting CUDA_VISIBLE_DEVICES." << std::endl; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index e2eab19e45..5fd442ffc9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -381,12 +381,12 @@ public: // MDRangePolicy impl template< class FunctorType , class ... Traits > class ParallelFor< FunctorType - , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::MDRangePolicy< Traits ... > , Kokkos::Cuda > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ; + typedef Kokkos::MDRangePolicy< Traits ... 
> Policy ; using RP = Policy; typedef typename Policy::array_index_type array_index_type; typedef typename Policy::index_type index_type; @@ -402,7 +402,7 @@ public: __device__ void operator()(void) const { - Kokkos::Experimental::Impl::Refactor::DeviceIterateTile(m_rp,m_functor).exec_range(); + Kokkos::Impl::Refactor::DeviceIterateTile(m_rp,m_functor).exec_range(); } @@ -648,10 +648,11 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTagFwd > ValueJoin ; public: @@ -721,7 +722,7 @@ public: } // Reduce with final value at blockDim.y - 1 location. - if ( cuda_single_inter_block_reduce_scan( + if ( cuda_single_inter_block_reduce_scan( ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory() , m_scratch_space , m_scratch_flags ) ) { @@ -731,7 +732,7 @@ public: size_type * const global = m_unified_space ? 
m_unified_space : m_scratch_space ; if ( threadIdx.y == 0 ) { - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); } if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } @@ -766,11 +767,11 @@ public: value_type init; ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init); - if(Impl::cuda_inter_block_reduction + if(Impl::cuda_inter_block_reduction (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) { const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; if(id==0) { - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); *result = value; } } @@ -858,14 +859,14 @@ public: // MDRangePolicy impl template< class FunctorType , class ReducerType, class ... Traits > class ParallelReduce< FunctorType - , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::MDRangePolicy< Traits ... > , ReducerType , Kokkos::Cuda > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > Policy ; + typedef Kokkos::MDRangePolicy< Traits ... 
> Policy ; typedef typename Policy::array_index_type array_index_type; typedef typename Policy::index_type index_type; @@ -875,10 +876,11 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTagFwd > ValueJoin ; public: @@ -898,7 +900,7 @@ public: size_type * m_scratch_flags ; size_type * m_unified_space ; - typedef typename Kokkos::Experimental::Impl::Reduce::DeviceIterateTile DeviceIteratePattern; + typedef typename Kokkos::Impl::Reduce::DeviceIterateTile DeviceIteratePattern; // Shall we use the shfl based reduction or not (only use it for static sized types of more than 128bit enum { UseShflReduction = ((sizeof(value_type)>2*sizeof(double)) && ValueTraits::StaticValueSize) }; @@ -913,7 +915,7 @@ public: void exec_range( reference_type update ) const { - Kokkos::Experimental::Impl::Reduce::DeviceIterateTile(m_policy, m_functor, update).exec_range(); + Kokkos::Impl::Reduce::DeviceIterateTile(m_policy, m_functor, update).exec_range(); } inline @@ -942,7 +944,7 @@ public: // Reduce with final value at blockDim.y - 1 location. 
// Problem: non power-of-two blockDim - if ( cuda_single_inter_block_reduce_scan( + if ( cuda_single_inter_block_reduce_scan( ReducerConditional::select(m_functor , m_reducer) , blockIdx.x , gridDim.x , kokkos_impl_cuda_shared_memory() , m_scratch_space , m_scratch_flags ) ) { @@ -951,7 +953,7 @@ public: size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ; if ( threadIdx.y == 0 ) { - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); } if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } @@ -983,11 +985,11 @@ public: value_type init; ValueInit::init( ReducerConditional::select(m_functor , m_reducer) , &init); - if(Impl::cuda_inter_block_reduction + if(Impl::cuda_inter_block_reduction (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,max_active_thread)) { const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; if(id==0) { - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); *result = value; } } @@ -1100,10 +1102,11 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + typedef 
Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTagFwd > ValueJoin ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; @@ -1222,7 +1225,7 @@ public: size_type * const global = m_unified_space ? m_unified_space : m_scratch_space ; if ( threadIdx.y == 0 ) { - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , shared ); } if ( CudaTraits::WarpSize < word_count.value ) { __syncthreads(); } @@ -1260,7 +1263,7 @@ public: (value,init,ValueJoin(ReducerConditional::select(m_functor , m_reducer)),m_scratch_space,result,m_scratch_flags,blockDim.y)) { const unsigned id = threadIdx.y*blockDim.x + threadIdx.x; if(id==0) { - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , (void*) &value ); *result = value; } } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp index 709cbbd534..a478396910 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ReduceScan.hpp @@ -69,7 +69,7 @@ void cuda_shfl( T & out , T const & in , int lane , typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width ) { *reinterpret_cast(&out) = - __shfl( *reinterpret_cast(&in) , lane , width ); + KOKKOS_IMPL_CUDA_SHFL( *reinterpret_cast(&in) , lane , width ); } template< typename T > @@ -83,7 +83,7 @@ void cuda_shfl( T & out , T const & in , int lane 
, for ( int i = 0 ; i < N ; ++i ) { reinterpret_cast(&out)[i] = - __shfl( reinterpret_cast(&in)[i] , lane , width ); + KOKKOS_IMPL_CUDA_SHFL( reinterpret_cast(&in)[i] , lane , width ); } } @@ -95,7 +95,7 @@ void cuda_shfl_down( T & out , T const & in , int delta , typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width ) { *reinterpret_cast(&out) = - __shfl_down( *reinterpret_cast(&in) , delta , width ); + KOKKOS_IMPL_CUDA_SHFL_DOWN( *reinterpret_cast(&in) , delta , width ); } template< typename T > @@ -109,7 +109,7 @@ void cuda_shfl_down( T & out , T const & in , int delta , for ( int i = 0 ; i < N ; ++i ) { reinterpret_cast(&out)[i] = - __shfl_down( reinterpret_cast(&in)[i] , delta , width ); + KOKKOS_IMPL_CUDA_SHFL_DOWN( reinterpret_cast(&in)[i] , delta , width ); } } @@ -121,7 +121,7 @@ void cuda_shfl_up( T & out , T const & in , int delta , typename std::enable_if< sizeof(int) == sizeof(T) , int >::type width ) { *reinterpret_cast(&out) = - __shfl_up( *reinterpret_cast(&in) , delta , width ); + KOKKOS_IMPL_CUDA_SHFL_UP( *reinterpret_cast(&in) , delta , width ); } template< typename T > @@ -135,7 +135,7 @@ void cuda_shfl_up( T & out , T const & in , int delta , for ( int i = 0 ; i < N ; ++i ) { reinterpret_cast(&out)[i] = - __shfl_up( reinterpret_cast(&in)[i] , delta , width ); + KOKKOS_IMPL_CUDA_SHFL_UP( reinterpret_cast(&in)[i] , delta , width ); } } @@ -268,31 +268,31 @@ bool cuda_inter_block_reduction( typename FunctorValueTraits< FunctorType , ArgT if( id + 1 < int(gridDim.x) ) join(value, tmp); } - int active = __ballot(1); + int active = KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2,32); if( id + 2 < int(gridDim.x) ) join(value, tmp); } - active += __ballot(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4,32); if( id + 4 < int(gridDim.x) ) join(value, tmp); } - active += __ballot(1); + active += 
KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8,32); if( id + 8 < int(gridDim.x) ) join(value, tmp); } - active += __ballot(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16,32); if( id + 16 < int(gridDim.x) ) join(value, tmp); } - active += __ballot(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); } } //The last block has in its thread=0 the global reduction value through "value" @@ -432,31 +432,31 @@ cuda_inter_block_reduction( const ReducerType& reducer, if( id + 1 < int(gridDim.x) ) reducer.join(value, tmp); } - int active = __ballot(1); + int active = KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 2) { value_type tmp = Kokkos::shfl_down(value, 2,32); if( id + 2 < int(gridDim.x) ) reducer.join(value, tmp); } - active += __ballot(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 4) { value_type tmp = Kokkos::shfl_down(value, 4,32); if( id + 4 < int(gridDim.x) ) reducer.join(value, tmp); } - active += __ballot(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 8) { value_type tmp = Kokkos::shfl_down(value, 8,32); if( id + 8 < int(gridDim.x) ) reducer.join(value, tmp); } - active += __ballot(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); if (int(blockDim.x*blockDim.y) > 16) { value_type tmp = Kokkos::shfl_down(value, 16,32); if( id + 16 < int(gridDim.x) ) reducer.join(value, tmp); } - active += __ballot(1); + active += KOKKOS_IMPL_CUDA_BALLOT(1); } } diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp index e11ae4798f..1ff4ff3540 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_UniqueToken.hpp @@ -73,16 +73,16 @@ public: KOKKOS_INLINE_FUNCTION UniqueToken() : m_buffer(0), m_count(0) {} - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION_DEFAULTED 
UniqueToken( const UniqueToken & ) = default; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION_DEFAULTED UniqueToken( UniqueToken && ) = default; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION_DEFAULTED UniqueToken & operator=( const UniqueToken & ) = default ; - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION_DEFAULTED UniqueToken & operator=( UniqueToken && ) = default ; /// \brief upper bound for acquired values, i.e. 0 <= value < size() diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index 99d8fcc999..264f77b3bc 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -47,7 +47,7 @@ #ifdef KOKKOS_ENABLE_CUDA #include - +#include namespace Kokkos { @@ -91,12 +91,12 @@ namespace Impl { KOKKOS_INLINE_FUNCTION int shfl(const int &val, const int& srcLane, const int& width ) { - return __shfl(val,srcLane,width); + return KOKKOS_IMPL_CUDA_SHFL(val,srcLane,width); } KOKKOS_INLINE_FUNCTION float shfl(const float &val, const int& srcLane, const int& width ) { - return __shfl(val,srcLane,width); + return KOKKOS_IMPL_CUDA_SHFL(val,srcLane,width); } template @@ -105,7 +105,7 @@ namespace Impl { ) { Scalar tmp1 = val; float tmp = *reinterpret_cast(&tmp1); - tmp = __shfl(tmp,srcLane,width); + tmp = KOKKOS_IMPL_CUDA_SHFL(tmp,srcLane,width); return *reinterpret_cast(&tmp); } @@ -113,8 +113,8 @@ namespace Impl { double shfl(const double &val, const int& srcLane, const int& width) { int lo = __double2loint(val); int hi = __double2hiint(val); - lo = __shfl(lo,srcLane,width); - hi = __shfl(hi,srcLane,width); + lo = KOKKOS_IMPL_CUDA_SHFL(lo,srcLane,width); + hi = KOKKOS_IMPL_CUDA_SHFL(hi,srcLane,width); return __hiloint2double(hi,lo); } @@ -123,8 +123,8 @@ namespace Impl { Scalar shfl(const Scalar &val, const int& srcLane, const typename Impl::enable_if< (sizeof(Scalar) == 8) ,int>::type& width) { int lo = __double2loint(*reinterpret_cast(&val)); int 
hi = __double2hiint(*reinterpret_cast(&val)); - lo = __shfl(lo,srcLane,width); - hi = __shfl(hi,srcLane,width); + lo = KOKKOS_IMPL_CUDA_SHFL(lo,srcLane,width); + hi = KOKKOS_IMPL_CUDA_SHFL(hi,srcLane,width); const double tmp = __hiloint2double(hi,lo); return *(reinterpret_cast(&tmp)); } @@ -137,18 +137,18 @@ namespace Impl { s_val = val; for(int i = 0; i @@ -156,7 +156,7 @@ namespace Impl { Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) { Scalar tmp1 = val; float tmp = *reinterpret_cast(&tmp1); - tmp = __shfl_down(tmp,delta,width); + tmp = KOKKOS_IMPL_CUDA_SHFL_DOWN(tmp,delta,width); return *reinterpret_cast(&tmp); } @@ -164,8 +164,8 @@ namespace Impl { double shfl_down(const double &val, const int& delta, const int& width) { int lo = __double2loint(val); int hi = __double2hiint(val); - lo = __shfl_down(lo,delta,width); - hi = __shfl_down(hi,delta,width); + lo = KOKKOS_IMPL_CUDA_SHFL_DOWN(lo,delta,width); + hi = KOKKOS_IMPL_CUDA_SHFL_DOWN(hi,delta,width); return __hiloint2double(hi,lo); } @@ -174,8 +174,8 @@ namespace Impl { Scalar shfl_down(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) { int lo = __double2loint(*reinterpret_cast(&val)); int hi = __double2hiint(*reinterpret_cast(&val)); - lo = __shfl_down(lo,delta,width); - hi = __shfl_down(hi,delta,width); + lo = KOKKOS_IMPL_CUDA_SHFL_DOWN(lo,delta,width); + hi = KOKKOS_IMPL_CUDA_SHFL_DOWN(hi,delta,width); const double tmp = __hiloint2double(hi,lo); return *(reinterpret_cast(&tmp)); } @@ -188,18 +188,18 @@ namespace Impl { s_val = val; for(int i = 0; i @@ -207,7 +207,7 @@ namespace Impl { Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 4) , int >::type & width) { Scalar tmp1 = val; float tmp = *reinterpret_cast(&tmp1); - tmp = __shfl_up(tmp,delta,width); + tmp = KOKKOS_IMPL_CUDA_SHFL_UP(tmp,delta,width); return 
*reinterpret_cast(&tmp); } @@ -215,8 +215,8 @@ namespace Impl { double shfl_up(const double &val, const int& delta, const int& width ) { int lo = __double2loint(val); int hi = __double2hiint(val); - lo = __shfl_up(lo,delta,width); - hi = __shfl_up(hi,delta,width); + lo = KOKKOS_IMPL_CUDA_SHFL_UP(lo,delta,width); + hi = KOKKOS_IMPL_CUDA_SHFL_UP(hi,delta,width); return __hiloint2double(hi,lo); } @@ -225,8 +225,8 @@ namespace Impl { Scalar shfl_up(const Scalar &val, const int& delta, const typename Impl::enable_if< (sizeof(Scalar) == 8) , int >::type & width) { int lo = __double2loint(*reinterpret_cast(&val)); int hi = __double2hiint(*reinterpret_cast(&val)); - lo = __shfl_up(lo,delta,width); - hi = __shfl_up(hi,delta,width); + lo = KOKKOS_IMPL_CUDA_SHFL_UP(lo,delta,width); + hi = KOKKOS_IMPL_CUDA_SHFL_UP(hi,delta,width); const double tmp = __hiloint2double(hi,lo); return *(reinterpret_cast(&tmp)); } @@ -239,7 +239,7 @@ namespace Impl { s_val = val; for(int i = 0; i +#if ( CUDA_VERSION < 9000 ) +#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot(x) +#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl(x,y,z) +#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up(x,y,z) +#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) __shfl_down(x,y,z) +#else +#define KOKKOS_IMPL_CUDA_BALLOT(x) __ballot_sync(0xffffffff,x) +#define KOKKOS_IMPL_CUDA_SHFL(x,y,z) __shfl_sync(0xffffffff,x,y,z) +#define KOKKOS_IMPL_CUDA_SHFL_UP(x,y,z) __shfl_up_sync(0xffffffff,x,y,z) +#define KOKKOS_IMPL_CUDA_SHFL_DOWN(x,y,z) __shfl_down_sync(0xffffffff,x,y,z) +#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index d641622bb6..32ee7d0e59 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -127,11 +127,11 @@ struct CudaTextureFetch { template< class CudaMemorySpace > inline explicit CudaTextureFetch( const ValueType * const arg_ptr - , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void 
> & record + , Kokkos::Impl::SharedAllocationRecord< CudaMemorySpace , void > * record ) - : m_obj( record.template attach_texture_object< AliasType >() ) + : m_obj( record->template attach_texture_object< AliasType >() ) , m_ptr( arg_ptr ) - , m_offset( record.attach_texture_object_offset( reinterpret_cast( arg_ptr ) ) ) + , m_offset( record->attach_texture_object_offset( reinterpret_cast( arg_ptr ) ) ) {} // Texture object spans the entire allocation. @@ -199,8 +199,8 @@ struct CudaLDGFetch { template< class CudaMemorySpace > inline explicit CudaLDGFetch( const ValueType * const arg_ptr - , Kokkos::Experimental::Impl::SharedAllocationRecord< CudaMemorySpace , void > const & - ) + , Kokkos::Impl::SharedAllocationRecord* + ) : m_ptr( arg_ptr ) {} @@ -285,7 +285,21 @@ public: // Assignment of texture = non-texture requires creation of a texture object // which can only occur on the host. In addition, 'get_record' is only valid // if called in a host execution space - return handle_type( arg_data_ptr , arg_tracker.template get_record< typename Traits::memory_space >() ); + + + typedef typename Traits::memory_space memory_space ; + typedef typename Impl::SharedAllocationRecord record ; + + record * const r = arg_tracker.template get_record< memory_space >(); + +#if ! 
defined( KOKKOS_ENABLE_CUDA_LDG_INTRINSIC ) + if ( 0 == r ) { + Kokkos::abort("Cuda const random access View using Cuda texture memory requires Kokkos to allocate the View's memory"); + } +#endif + + return handle_type( arg_data_ptr , r ); + #else Kokkos::Impl::cuda_abort("Cannot create Cuda texture object from within a Cuda kernel"); return handle_type(); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index 99778c64b1..9f5ab1b1f2 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -48,50 +48,52 @@ namespace Kokkos { namespace Impl { template< class FunctorType , class ... Traits > -class ParallelFor< FunctorType , - Kokkos::Experimental::WorkGraphPolicy< Traits ... > , - Kokkos::Cuda +class ParallelFor< FunctorType + , Kokkos::WorkGraphPolicy< Traits ... > + , Kokkos::Cuda > - : public Kokkos::Impl::Experimental:: - WorkGraphExec< FunctorType, - Kokkos::Cuda, - Traits ... - > { public: - typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; - typedef Kokkos::Impl::Experimental:: - WorkGraphExec Base ; + typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ; typedef ParallelFor Self ; private: - template< class TagType > - __device__ - typename std::enable_if< std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - Base::m_functor( i ); - } + Policy m_policy ; + FunctorType m_functor ; template< class TagType > - __device__ + __device__ inline + typename std::enable_if< std::is_same< TagType , void >::value >::type + exec_one( const std::int32_t w ) const noexcept + { m_functor( w ); } + + template< class TagType > + __device__ inline typename std::enable_if< ! 
std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - const TagType t{} ; - Base::m_functor( t , i ); - } + exec_one( const std::int32_t w ) const noexcept + { const TagType t{} ; m_functor( t , w ); } public: - __device__ - inline - void operator()() const { - for (std::int32_t i; (-1 != (i = Base::before_work())); ) { - exec_one< typename Policy::work_tag >( i ); - Base::after_work(i); + __device__ inline + void operator()() const noexcept + { + if ( 0 == ( threadIdx.y % 16 ) ) { + + // Spin until COMPLETED_TOKEN. + // END_TOKEN indicates no work is currently available. + + for ( std::int32_t w = Policy::END_TOKEN ; + Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) { + if ( Policy::END_TOKEN != w ) { + exec_one< typename Policy::work_tag >( w ); + m_policy.completed_work(w); + } + } + } } - } inline void execute() @@ -108,9 +110,9 @@ public: inline ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy ) - : Base( arg_functor, arg_policy ) - { - } + : m_policy( arg_policy ) + , m_functor( arg_functor ) + {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 6ef7443a14..9486f8d26a 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -55,7 +55,7 @@ #include #endif -namespace Kokkos { namespace Experimental { +namespace Kokkos { // ------------------------------------------------------------------ // @@ -331,11 +331,23 @@ struct MDRangePolicy } }; + +} // namespace Kokkos + +// For backward compatibility +namespace Kokkos { namespace Experimental { + using Kokkos::MDRangePolicy; + using Kokkos::Rank; + using Kokkos::Iterate; +} } // end Kokkos::Experimental // ------------------------------------------------------------------ // // ------------------------------------------------------------------ // //md_parallel_for - deprecated use 
parallel_for // ------------------------------------------------------------------ // + +namespace Kokkos { namespace Experimental { + template void md_parallel_for( MDRange const& range , Functor const& f @@ -347,7 +359,7 @@ void md_parallel_for( MDRange const& range ) >::type* = 0 ) { - Impl::MDFunctor g(range, f); + Kokkos::Impl::Experimental::MDFunctor g(range, f); using range_policy = typename MDRange::impl_range_policy; @@ -365,7 +377,7 @@ void md_parallel_for( const std::string& str ) >::type* = 0 ) { - Impl::MDFunctor g(range, f); + Kokkos::Impl::Experimental::MDFunctor g(range, f); using range_policy = typename MDRange::impl_range_policy; @@ -385,7 +397,7 @@ void md_parallel_for( const std::string& str ) >::type* = 0 ) { - Impl::DeviceIterateTile closure(range, f); + Kokkos::Impl::DeviceIterateTile closure(range, f); closure.execute(); } @@ -400,7 +412,7 @@ void md_parallel_for( MDRange const& range ) >::type* = 0 ) { - Impl::DeviceIterateTile closure(range, f); + Kokkos::Impl::DeviceIterateTile closure(range, f); closure.execute(); } #endif @@ -421,7 +433,7 @@ void md_parallel_reduce( MDRange const& range ) >::type* = 0 ) { - Impl::MDFunctor g(range, f); + Kokkos::Impl::Experimental::MDFunctor g(range, f); using range_policy = typename MDRange::impl_range_policy; Kokkos::parallel_reduce( str, range_policy(0, range.m_num_tiles).set_chunk_size(1), g, v ); @@ -439,7 +451,7 @@ void md_parallel_reduce( const std::string& str ) >::type* = 0 ) { - Impl::MDFunctor g(range, f); + Kokkos::Impl::Experimental::MDFunctor g(range, f); using range_policy = typename MDRange::impl_range_policy; @@ -448,7 +460,7 @@ void md_parallel_reduce( const std::string& str // Cuda - md_parallel_reduce not implemented - use parallel_reduce -}} // namespace Kokkos::Experimental +} } // namespace Kokkos::Experimental #endif //KOKKOS_CORE_EXP_MD_RANGE_POLICY_HPP diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index 5480dbf40c..f9366a9594 
100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -81,10 +81,10 @@ struct IndexType /**\brief Specify Launch Bounds for CUDA execution. * - * The "best" defaults may be architecture specific. + * If no launch bounds specified then do not set launch bounds. */ -template< unsigned int maxT = 1024 /* Max threads per block */ - , unsigned int minB = 1 /* Min blocks per SM */ +template< unsigned int maxT = 0 /* Max threads per block */ + , unsigned int minB = 0 /* Min blocks per SM */ > struct LaunchBounds { @@ -280,6 +280,9 @@ struct MemorySpaceAccess { enum { deepcopy = assignable }; }; +}} // namespace Kokkos::Impl + +namespace Kokkos { /**\brief Can AccessSpace access MemorySpace ? * @@ -358,6 +361,13 @@ public: >::type space ; }; +} // namespace Kokkos + +namespace Kokkos { +namespace Impl { + +using Kokkos::SpaceAccessibility ; // For backward compatibility + }} // namespace Kokkos::Impl //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 3748c35eb0..b1531a0a1b 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -99,13 +99,17 @@ struct InitArguments { int num_threads; int num_numa; int device_id; + bool disable_warnings; InitArguments( int nt = -1 , int nn = -1 - , int dv = -1) - : num_threads( nt ) - , num_numa( nn ) - , device_id( dv ) + , int dv = -1 + , bool dw = false + ) + : num_threads{ nt } + , num_numa{ nn } + , device_id{ dv } + , disable_warnings{ dw } {} }; @@ -113,6 +117,10 @@ void initialize(int& narg, char* arg[]); void initialize(const InitArguments& args = InitArguments()); +bool is_initialized() noexcept; + +bool show_warnings() noexcept; + /** \brief Finalize the spaces that were initialized via Kokkos::initialize */ void finalize(); diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index 
b9c131cd7a..7bbc938010 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -45,7 +45,6 @@ #define KOKKOS_CRS_HPP namespace Kokkos { -namespace Experimental { /// \class Crs /// \brief Compressed row storage array. @@ -164,7 +163,7 @@ void transpose_crs( Crs& out, Crs const& in); -}} // namespace Kokkos::Experimental +} // namespace Kokkos /*--------------------------------------------------------------------------*/ @@ -172,7 +171,6 @@ void transpose_crs( namespace Kokkos { namespace Impl { -namespace Experimental { template class GetCrsTransposeCounts { @@ -277,14 +275,13 @@ class FillCrsTransposeEntries { } }; -}}} // namespace Kokkos::Impl::Experimental +}} // namespace Kokkos::Impl /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ namespace Kokkos { -namespace Experimental { template< class OutCounts, class DataType, @@ -297,8 +294,7 @@ void get_crs_transpose_counts( std::string const& name) { using InCrs = Crs; out = OutCounts(name, in.numRows()); - Kokkos::Impl::Experimental:: - GetCrsTransposeCounts functor(in, out); + Kokkos::Impl::GetCrsTransposeCounts functor(in, out); } template< class OutRowMap, @@ -308,8 +304,7 @@ typename OutRowMap::value_type get_crs_row_map_from_counts( InCounts const& in, std::string const& name) { out = OutRowMap(ViewAllocateWithoutInitializing(name), in.size() + 1); - Kokkos::Impl::Experimental:: - CrsRowMapFromCounts functor(in, out); + Kokkos::Impl::CrsRowMapFromCounts functor(in, out); return functor.execute(); } @@ -326,32 +321,37 @@ void transpose_crs( typedef View counts_type ; { counts_type counts; - Kokkos::Experimental::get_crs_transpose_counts(counts, in); - Kokkos::Experimental::get_crs_row_map_from_counts(out.row_map, counts, + Kokkos::get_crs_transpose_counts(counts, in); + Kokkos::get_crs_row_map_from_counts(out.row_map, counts, "tranpose_row_map"); } out.entries = 
decltype(out.entries)("transpose_entries", in.entries.size()); - Kokkos::Impl::Experimental:: + Kokkos::Impl:: FillCrsTransposeEntries entries_functor(in, out); } template< class CrsType, - class Functor> -struct CountAndFill { + class Functor, + class ExecutionSpace = typename CrsType::execution_space> +struct CountAndFillBase; + +template< class CrsType, + class Functor, + class ExecutionSpace> +struct CountAndFillBase { using data_type = typename CrsType::size_type; using size_type = typename CrsType::size_type; using row_map_type = typename CrsType::row_map_type; - using entries_type = typename CrsType::entries_type; using counts_type = row_map_type; CrsType m_crs; Functor m_functor; counts_type m_counts; struct Count {}; - KOKKOS_INLINE_FUNCTION void operator()(Count, size_type i) const { + inline void operator()(Count, size_type i) const { m_counts(i) = m_functor(i, nullptr); } struct Fill {}; - KOKKOS_INLINE_FUNCTION void operator()(Fill, size_type i) const { + inline void operator()(Fill, size_type i) const { auto j = m_crs.row_map(i); /* we don't want to access entries(entries.size()), even if its just to get its address and never use it. 
@@ -363,13 +363,63 @@ struct CountAndFill { nullptr : (&(m_crs.entries(j))); m_functor(i, fill); } - using self_type = CountAndFill; - CountAndFill(CrsType& crs, size_type nrows, Functor const& f): + CountAndFillBase(CrsType& crs, Functor const& f): m_crs(crs), m_functor(f) + {} +}; + +#if defined( KOKKOS_ENABLE_CUDA ) +template< class CrsType, + class Functor> +struct CountAndFillBase { + using data_type = typename CrsType::size_type; + using size_type = typename CrsType::size_type; + using row_map_type = typename CrsType::row_map_type; + using counts_type = row_map_type; + CrsType m_crs; + Functor m_functor; + counts_type m_counts; + struct Count {}; + __device__ inline void operator()(Count, size_type i) const { + m_counts(i) = m_functor(i, nullptr); + } + struct Fill {}; + __device__ inline void operator()(Fill, size_type i) const { + auto j = m_crs.row_map(i); + /* we don't want to access entries(entries.size()), even if its just to get its + address and never use it. + this can happen when row (i) is empty and all rows after it are also empty. + we could compare to row_map(i + 1), but that is a read from global memory, + whereas dimension_0() should be part of the View in registers (or constant memory) */ + data_type* fill = + (j == static_cast(m_crs.entries.dimension_0())) ? 
+ nullptr : (&(m_crs.entries(j))); + m_functor(i, fill); + } + CountAndFillBase(CrsType& crs, Functor const& f): + m_crs(crs), + m_functor(f) + {} +}; +#endif + +template< class CrsType, + class Functor> +struct CountAndFill : public CountAndFillBase { + using base_type = CountAndFillBase; + using typename base_type::data_type; + using typename base_type::size_type; + using typename base_type::counts_type; + using typename base_type::Count; + using typename base_type::Fill; + using entries_type = typename CrsType::entries_type; + using self_type = CountAndFill; + CountAndFill(CrsType& crs, size_type nrows, Functor const& f): + base_type(crs, f) { using execution_space = typename CrsType::execution_space; - m_counts = counts_type("counts", nrows); + this->m_counts = counts_type("counts", nrows); { using count_policy_type = RangePolicy; using count_closure_type = @@ -377,10 +427,10 @@ struct CountAndFill { const count_closure_type closure(*this, count_policy_type(0, nrows)); closure.execute(); } - auto nentries = Kokkos::Experimental:: - get_crs_row_map_from_counts(m_crs.row_map, m_counts); - m_counts = counts_type(); - m_crs.entries = entries_type("entries", nentries); + auto nentries = Kokkos:: + get_crs_row_map_from_counts(this->m_crs.row_map, this->m_counts); + this->m_counts = counts_type(); + this->m_crs.entries = entries_type("entries", nentries); { using fill_policy_type = RangePolicy; using fill_closure_type = @@ -388,7 +438,7 @@ struct CountAndFill { const fill_closure_type closure(*this, fill_policy_type(0, nrows)); closure.execute(); } - crs = m_crs; + crs = this->m_crs; } }; @@ -398,9 +448,9 @@ void count_and_fill_crs( CrsType& crs, typename CrsType::size_type nrows, Functor const& f) { - Kokkos::Experimental::CountAndFill(crs, nrows, f); + Kokkos::CountAndFill(crs, nrows, f); } -}} // namespace Kokkos::Experimental +} // namespace Kokkos #endif /* #define KOKKOS_CRS_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp 
b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index a8c4d77c62..6f6343713c 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -379,12 +379,13 @@ Impl::PerThreadValue PerThread(const int& arg); * uses variadic templates. Each and any of the template arguments can * be omitted. * - * Possible Template arguments and there default values: + * Possible Template arguments and their default values: * ExecutionSpace (DefaultExecutionSpace): where to execute code. Must be enabled. * WorkTag (none): Tag which is used as the first argument for the functor operator. * Schedule (Schedule): Scheduling Policy (Dynamic, or Static). * IndexType (IndexType: Integer Index type used to iterate over the Index space. - * LaunchBounds (LaunchBounds<1024,1>: Launch Bounds for CUDA compilation. + * LaunchBounds Launch Bounds for CUDA compilation, + * default of LaunchBounds<0,0> indicates no launch bounds specified. */ template< class ... Properties> class TeamPolicy: public diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 7137eaae4b..b51f38efb2 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -251,7 +251,7 @@ #endif #endif -#if defined( __PGIC__ ) && !defined( __GNUC__ ) +#if defined( __PGIC__ ) #define KOKKOS_COMPILER_PGI __PGIC__*100+__PGIC_MINOR__*10+__PGIC_PATCHLEVEL__ #if ( 1540 > KOKKOS_COMPILER_PGI ) @@ -268,24 +268,22 @@ #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 - #define KOKKOS_ENABLE_PRAGMA_SIMD 1 + #if ( 1800 > KOKKOS_COMPILER_INTEL ) + #define KOKKOS_ENABLE_PRAGMA_SIMD 1 + #endif #if ( __INTEL_COMPILER > 1400 ) #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 #endif + #if ! 
defined( KOKKOS_MEMORY_ALIGNMENT ) + #define KOKKOS_MEMORY_ALIGNMENT 64 + #endif + #define KOKKOS_RESTRICT __restrict__ - #ifndef KOKKOS_ALIGN - #define KOKKOS_ALIGN(size) __attribute__((aligned(size))) - #endif - - #ifndef KOKKOS_ALIGN_PTR - #define KOKKOS_ALIGN_PTR(size) __attribute__((align_value(size))) - #endif - - #ifndef KOKKOS_ALIGN_SIZE - #define KOKKOS_ALIGN_SIZE 64 + #ifndef KOKKOS_IMPL_ALIGN_PTR + #define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((align_value(size))) #endif #if ( 1400 > KOKKOS_COMPILER_INTEL ) @@ -351,6 +349,11 @@ #if !defined( KOKKOS_FORCEINLINE_FUNCTION ) #define KOKKOS_FORCEINLINE_FUNCTION inline __attribute__((always_inline)) #endif + + #if !defined( KOKKOS_IMPL_ALIGN_PTR ) + #define KOKKOS_IMPL_ALIGN_PTR(size) __attribute__((aligned(size))) + #endif + #endif //---------------------------------------------------------------------------- @@ -426,16 +429,16 @@ //---------------------------------------------------------------------------- // Define Macro for alignment: -#if !defined KOKKOS_ALIGN_SIZE - #define KOKKOS_ALIGN_SIZE 16 +#if ! defined( KOKKOS_MEMORY_ALIGNMENT ) + #define KOKKOS_MEMORY_ALIGNMENT 16 #endif -#if !defined( KOKKOS_ALIGN ) - #define KOKKOS_ALIGN(size) __attribute__((aligned(size))) +#if ! 
defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD ) + #define KOKKOS_MEMORY_ALIGNMENT_THRESHOLD 4 #endif -#if !defined( KOKKOS_ALIGN_PTR ) - #define KOKKOS_ALIGN_PTR(size) __attribute__((aligned(size))) +#if !defined( KOKKOS_IMPL_ALIGN_PTR ) + #define KOKKOS_IMPL_ALIGN_PTR(size) /* */ #endif //---------------------------------------------------------------------------- @@ -510,5 +513,11 @@ #define KOKKOS_ENABLE_TASKDAG #endif + +#if defined ( KOKKOS_ENABLE_CUDA ) + #if ( 9000 <= CUDA_VERSION ) + #define KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND + #endif +#endif #endif // #ifndef KOKKOS_MACROS_HPP diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp index 4ba5812f9e..9199725767 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -51,6 +51,27 @@ #include #include +namespace Kokkos { +namespace Impl { +/* Report violation of size constraints: + * min_block_alloc_size <= max_block_alloc_size + * max_block_alloc_size <= min_superblock_size + * min_superblock_size <= max_superblock_size + * min_superblock_size <= min_total_alloc_size + * min_superblock_size <= min_block_alloc_size * + * max_block_per_superblock + */ +void memory_pool_bounds_verification + ( size_t min_block_alloc_size + , size_t max_block_alloc_size + , size_t min_superblock_size + , size_t max_superblock_size + , size_t max_block_per_superblock + , size_t min_total_alloc_size + ); +} +} + namespace Kokkos { template< typename DeviceType > @@ -111,6 +132,10 @@ private: public: + /**\brief The maximum size of a superblock and block */ + enum : uint32_t { max_superblock_size = 1LU << 31 /* 2 gigabytes */ }; + enum : uint32_t { max_block_per_superblock = max_bit_count }; + //-------------------------------------------------------------------------- KOKKOS_INLINE_FUNCTION @@ -206,7 +231,7 @@ public: const uint32_t * sb_state_ptr = sb_state_array ; s << "pool_size(" << ( size_t(m_sb_count) << m_sb_size_lg2 ) << ")" - << " 
superblock_size(" << ( 1 << m_sb_size_lg2 ) << ")" << std::endl ; + << " superblock_size(" << ( 1LU << m_sb_size_lg2 ) << ")" << std::endl ; for ( int32_t i = 0 ; i < m_sb_count ; ++i , sb_state_ptr += m_sb_state_size ) { @@ -215,7 +240,7 @@ public: const uint32_t block_count_lg2 = (*sb_state_ptr) >> state_shift ; const uint32_t block_size_lg2 = m_sb_size_lg2 - block_count_lg2 ; - const uint32_t block_count = 1 << block_count_lg2 ; + const uint32_t block_count = 1u << block_count_lg2 ; const uint32_t block_used = (*sb_state_ptr) & state_used_mask ; s << "Superblock[ " << i << " / " << m_sb_count << " ] {" @@ -284,43 +309,71 @@ public: { const uint32_t int_align_lg2 = 3 ; /* align as int[8] */ const uint32_t int_align_mask = ( 1u << int_align_lg2 ) - 1 ; + const uint32_t default_min_block_size = 1u << 6 ; /* 64 bytes */ + const uint32_t default_max_block_size = 1u << 12 ;/* 4k bytes */ + const uint32_t default_min_superblock_size = 1u << 20 ;/* 1M bytes */ - // Constraints and defaults: - // min_block_alloc_size <= max_block_alloc_size - // max_block_alloc_size <= min_superblock_size - // min_superblock_size <= min_total_alloc_size + //-------------------------------------------------- + // Default block and superblock sizes: - const uint32_t MIN_BLOCK_SIZE = 1u << 6 /* 64 bytes */ ; - const uint32_t MAX_BLOCK_SIZE = 1u << 12 /* 4k bytes */ ; + if ( 0 == min_block_alloc_size ) { + // Default all sizes: - if ( 0 == min_block_alloc_size ) min_block_alloc_size = MIN_BLOCK_SIZE ; + min_superblock_size = + std::min( size_t(default_min_superblock_size) + , min_total_alloc_size ); + + min_block_alloc_size = + std::min( size_t(default_min_block_size) + , min_superblock_size ); + + max_block_alloc_size = + std::min( size_t(default_max_block_size) + , min_superblock_size ); + } + else if ( 0 == min_superblock_size ) { + + // Choose superblock size as minimum of: + // max_block_per_superblock * min_block_size + // max_superblock_size + // min_total_alloc_size + + const size_t 
max_superblock = + min_block_alloc_size * max_block_per_superblock ; + + min_superblock_size = + std::min( max_superblock , + std::min( size_t(max_superblock_size) + , min_total_alloc_size ) ); + } if ( 0 == max_block_alloc_size ) { - - max_block_alloc_size = MAX_BLOCK_SIZE ; - - // Upper bound of total allocation size - max_block_alloc_size = std::min( size_t(max_block_alloc_size) - , min_total_alloc_size ); - - // Lower bound of minimum block size - max_block_alloc_size = std::max( max_block_alloc_size - , min_block_alloc_size ); + max_block_alloc_size = min_superblock_size ; } - if ( 0 == min_superblock_size ) { - min_superblock_size = max_block_alloc_size ; + //-------------------------------------------------- - // Upper bound of total allocation size - min_superblock_size = std::min( size_t(min_superblock_size) - , min_total_alloc_size ); + /* Enforce size constraints: + * min_block_alloc_size <= max_block_alloc_size + * max_block_alloc_size <= min_superblock_size + * min_superblock_size <= max_superblock_size + * min_superblock_size <= min_total_alloc_size + * min_superblock_size <= min_block_alloc_size * + * max_block_per_superblock + */ - // Lower bound of maximum block size - min_superblock_size = std::max( min_superblock_size - , max_block_alloc_size ); - } + Kokkos::Impl::memory_pool_bounds_verification + ( min_block_alloc_size + , max_block_alloc_size + , min_superblock_size + , max_superblock_size + , max_block_per_superblock + , min_total_alloc_size + ); + //-------------------------------------------------- // Block and superblock size is power of two: + // Maximum value is 'max_superblock_size' m_min_block_size_lg2 = Kokkos::Impl::integral_power_of_two_that_contains(min_block_alloc_size); @@ -331,45 +384,26 @@ public: m_sb_size_lg2 = Kokkos::Impl::integral_power_of_two_that_contains(min_superblock_size); - // Constraints: - // m_min_block_size_lg2 <= m_max_block_size_lg2 <= m_sb_size_lg2 - // m_sb_size_lg2 <= m_min_block_size + max_bit_count_lg2 + 
{ + // number of superblocks is multiple of superblock size that + // can hold min_total_alloc_size. - if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_sb_size_lg2 ) { - m_min_block_size_lg2 = m_sb_size_lg2 - max_bit_count_lg2 ; - } - if ( m_min_block_size_lg2 + max_bit_count_lg2 < m_max_block_size_lg2 ) { - m_min_block_size_lg2 = m_max_block_size_lg2 - max_bit_count_lg2 ; - } - if ( m_max_block_size_lg2 < m_min_block_size_lg2 ) { - m_max_block_size_lg2 = m_min_block_size_lg2 ; - } - if ( m_sb_size_lg2 < m_max_block_size_lg2 ) { - m_sb_size_lg2 = m_max_block_size_lg2 ; + const uint64_t sb_size_mask = ( 1LU << m_sb_size_lg2 ) - 1 ; + + m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ; } - // At least 32 minimum size blocks in a superblock + { + // Any superblock can be assigned to the smallest size block + // Size the block bitset to maximum number of blocks - if ( m_sb_size_lg2 < m_min_block_size_lg2 + 5 ) { - m_sb_size_lg2 = m_min_block_size_lg2 + 5 ; + const uint32_t max_block_count_lg2 = + m_sb_size_lg2 - m_min_block_size_lg2 ; + + m_sb_state_size = + ( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ; } - // number of superblocks is multiple of superblock size that - // can hold min_total_alloc_size. - - const uint32_t sb_size_mask = ( 1u << m_sb_size_lg2 ) - 1 ; - - m_sb_count = ( min_total_alloc_size + sb_size_mask ) >> m_sb_size_lg2 ; - - // Any superblock can be assigned to the smallest size block - // Size the block bitset to maximum number of blocks - - const uint32_t max_block_count_lg2 = - m_sb_size_lg2 - m_min_block_size_lg2 ; - - m_sb_state_size = - ( CB::buffer_bound_lg2( max_block_count_lg2 ) + int_align_mask ) & ~int_align_mask ; - // Array of all superblock states const size_t all_sb_state_size = @@ -454,7 +488,7 @@ private: * Restrict lower bound to minimum block size. 
*/ KOKKOS_FORCEINLINE_FUNCTION - unsigned get_block_size_lg2( unsigned n ) const noexcept + uint32_t get_block_size_lg2( uint32_t n ) const noexcept { const unsigned i = Kokkos::Impl::integral_power_of_two_that_contains( n ); @@ -463,11 +497,12 @@ private: public: + /* Return 0 for invalid block size */ KOKKOS_INLINE_FUNCTION - uint32_t allocate_block_size( uint32_t alloc_size ) const noexcept + uint32_t allocate_block_size( uint64_t alloc_size ) const noexcept { return alloc_size <= (1UL << m_max_block_size_lg2) - ? ( 1u << get_block_size_lg2( alloc_size ) ) + ? ( 1UL << get_block_size_lg2( uint32_t(alloc_size) ) ) : 0 ; } @@ -485,246 +520,253 @@ public: void * allocate( size_t alloc_size , int32_t attempt_limit = 1 ) const noexcept { + if ( size_t(1LU << m_max_block_size_lg2) < alloc_size ) { + Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size"); + } + if ( 0 == alloc_size ) return (void*) 0 ; void * p = 0 ; const uint32_t block_size_lg2 = get_block_size_lg2( alloc_size ); - if ( block_size_lg2 <= m_max_block_size_lg2 ) { + // Allocation will fit within a superblock + // that has block sizes ( 1 << block_size_lg2 ) - // Allocation will fit within a superblock - // that has block sizes ( 1 << block_size_lg2 ) + const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ; + const uint32_t block_state = block_count_lg2 << state_shift ; + const uint32_t block_count = 1u << block_count_lg2 ; - const uint32_t block_count_lg2 = m_sb_size_lg2 - block_size_lg2 ; - const uint32_t block_state = block_count_lg2 << state_shift ; - const uint32_t block_count = 1u << block_count_lg2 ; + // Superblock hints for this block size: + // hint_sb_id_ptr[0] is the dynamically changing hint + // hint_sb_id_ptr[1] is the static start point - // Superblock hints for this block size: - // hint_sb_id_ptr[0] is the dynamically changing hint - // hint_sb_id_ptr[1] is the static start point + volatile uint32_t * const hint_sb_id_ptr + = 
m_sb_state_array /* memory pool state array */ + + m_hint_offset /* offset to hint portion of array */ + + HINT_PER_BLOCK_SIZE /* number of hints per block size */ + * ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */ - volatile uint32_t * const hint_sb_id_ptr - = m_sb_state_array /* memory pool state array */ - + m_hint_offset /* offset to hint portion of array */ - + HINT_PER_BLOCK_SIZE /* number of hints per block size */ - * ( block_size_lg2 - m_min_block_size_lg2 ); /* block size id */ + const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] ); - const int32_t sb_id_begin = int32_t( hint_sb_id_ptr[1] ); + // Fast query clock register 'tic' to pseudo-randomize + // the guess for which block within a superblock should + // be claimed. If not available then a search occurs. - // Fast query clock register 'tic' to pseudo-randomize - // the guess for which block within a superblock should - // be claimed. If not available then a search occurs. - - const uint32_t block_id_hint = - (uint32_t)( Kokkos::Impl::clock_tic() + const uint32_t block_id_hint = + (uint32_t)( Kokkos::Impl::clock_tic() #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) - // Spread out potentially concurrent access - // by threads within a warp or thread block. - + ( threadIdx.x + blockDim.x * threadIdx.y ) + // Spread out potentially concurrent access + // by threads within a warp or thread block. 
+ + ( threadIdx.x + blockDim.x * threadIdx.y ) #endif - ); + ); - // expected state of superblock for allocation - uint32_t sb_state = block_state ; + // expected state of superblock for allocation + uint32_t sb_state = block_state ; - int32_t sb_id = -1 ; + int32_t sb_id = -1 ; - volatile uint32_t * sb_state_array = 0 ; + volatile uint32_t * sb_state_array = 0 ; - while ( attempt_limit ) { + while ( attempt_limit ) { - int32_t hint_sb_id = -1 ; + int32_t hint_sb_id = -1 ; - if ( sb_id < 0 ) { + if ( sb_id < 0 ) { - // No superblock specified, try the hint for this block size + // No superblock specified, try the hint for this block size - sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr ); + sb_id = hint_sb_id = int32_t( *hint_sb_id_ptr ); + + sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size ); + } + + // Require: + // 0 <= sb_id + // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id + + if ( sb_state == ( state_header_mask & *sb_state_array ) ) { + + // This superblock state is as expected, for the moment. + // Attempt to claim a bit. The attempt updates the state + // so have already made sure the state header is as expected. + + const uint32_t count_lg2 = sb_state >> state_shift ; + const uint32_t mask = ( 1u << count_lg2 ) - 1 ; + + const Kokkos::pair result = + CB::acquire_bounded_lg2( sb_state_array + , count_lg2 + , block_id_hint & mask + , sb_state + ); + + // If result.first < 0 then failed to acquire + // due to either full or buffer was wrong state. + // Could be wrong state if a deallocation raced the + // superblock to empty before the acquire could succeed. 
+ + if ( 0 <= result.first ) { // acquired a bit + + const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ; + + // Set the allocated block pointer + + p = ((char*)( m_sb_state_array + m_data_offset )) + + ( uint64_t(sb_id) << m_sb_size_lg2 ) // superblock memory + + ( uint64_t(result.first) << size_lg2 ); // block memory + +#if 0 + printf( " MemoryPool(0x%lx) pointer(0x%lx) allocate(%lu) sb_id(%d) sb_state(0x%x) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n" + , (uintptr_t)m_sb_state_array + , (uintptr_t)p + , alloc_size + , sb_id + , sb_state + , (1u << size_lg2) + , (1u << count_lg2) + , result.first + , result.second ); +#endif + + break ; // Success + } + } + //------------------------------------------------------------------ + // Arrive here if failed to acquire a block. + // Must find a new superblock. + + // Start searching at designated index for this block size. + // Look for superblock that, in preferential order, + // 1) part-full superblock of this block size + // 2) empty superblock to claim for this block size + // 3) part-full superblock of the next larger block size + + sb_state = block_state ; // Expect to find the desired state + sb_id = -1 ; + + bool update_hint = false ; + int32_t sb_id_empty = -1 ; + int32_t sb_id_large = -1 ; + uint32_t sb_state_large = 0 ; + + sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ; + + for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) { + + // Query state of the candidate superblock. + // Note that the state may change at any moment + // as concurrent allocations and deallocations occur. 
+ + const uint32_t full_state = *sb_state_array ; + const uint32_t used = full_state & state_used_mask ; + const uint32_t state = full_state & state_header_mask ; + + if ( state == block_state ) { + + // Superblock is assigned to this block size + + if ( used < block_count ) { + + // There is room to allocate one block + + sb_id = id ; + + // Is there room to allocate more than one block? + + update_hint = used + 1 < block_count ; + + break ; + } + } + else if ( 0 == used ) { + + // Superblock is empty + + if ( -1 == sb_id_empty ) { + + // Superblock is not assigned to this block size + // and is the first empty superblock encountered. + // Save this id to use if a partfull superblock is not found. + + sb_id_empty = id ; + } + } + else if ( ( -1 == sb_id_empty /* have not found an empty */ ) && + ( -1 == sb_id_large /* have not found a larger */ ) && + ( state < block_state /* a larger block */ ) && + // is not full: + ( used < ( 1u << ( state >> state_shift ) ) ) ) { + // First superblock encountered that is + // larger than this block size and + // has room for an allocation. + // Save this id to use of partfull or empty superblock not found + sb_id_large = id ; + sb_state_large = state ; + } + + // Iterate around the superblock array: + + if ( ++id < m_sb_count ) { + sb_state_array += m_sb_state_size ; + } + else { + id = 0 ; + sb_state_array = m_sb_state_array ; + } + } + + // printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large); + + if ( sb_id < 0 ) { + + // Did not find a partfull superblock for this block size. + + if ( 0 <= sb_id_empty ) { + + // Found first empty superblock following designated superblock + // Attempt to claim it for this block size. + // If the claim fails assume that another thread claimed it + // for this block size and try to use it anyway, + // but do not update hint. 
+ + sb_id = sb_id_empty ; + + sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size ); + + // If successfully changed assignment of empty superblock 'sb_id' + // to this block_size then update the hint. + + const uint32_t state_empty = state_header_mask & *sb_state_array ; + + // If this thread claims the empty block then update the hint + update_hint = + state_empty == + Kokkos::atomic_compare_exchange + (sb_state_array,state_empty,block_state); + } + else if ( 0 <= sb_id_large ) { + + // Found a larger superblock with space available + + sb_id = sb_id_large ; + sb_state = sb_state_large ; sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size ); } - - // Require: - // 0 <= sb_id - // sb_state_array == m_sb_state_array + m_sb_state_size * sb_id - - if ( sb_state == ( state_header_mask & *sb_state_array ) ) { - - // This superblock state is as expected, for the moment. - // Attempt to claim a bit. The attempt updates the state - // so have already made sure the state header is as expected. - - const uint32_t count_lg2 = sb_state >> state_shift ; - const uint32_t mask = ( 1u << count_lg2 ) - 1 ; - - const Kokkos::pair result = - CB::acquire_bounded_lg2( sb_state_array - , count_lg2 - , block_id_hint & mask - , sb_state - ); - - // If result.first < 0 then failed to acquire - // due to either full or buffer was wrong state. - // Could be wrong state if a deallocation raced the - // superblock to empty before the acquire could succeed. 
- - if ( 0 <= result.first ) { // acquired a bit - - const uint32_t size_lg2 = m_sb_size_lg2 - count_lg2 ; - - // Set the allocated block pointer - - p = ((char*)( m_sb_state_array + m_data_offset )) - + ( uint32_t(sb_id) << m_sb_size_lg2 ) // superblock memory - + ( result.first << size_lg2 ); // block memory - - break ; // Success - } - -// printf(" acquire count_lg2(%d) sb_state(0x%x) sb_id(%d) result(%d,%d)\n" , count_lg2 , sb_state , sb_id , result.first , result.second ); - + else { + // Did not find a potentially usable superblock + --attempt_limit ; } - //------------------------------------------------------------------ - // Arrive here if failed to acquire a block. - // Must find a new superblock. + } - // Start searching at designated index for this block size. - // Look for superblock that, in preferential order, - // 1) part-full superblock of this block size - // 2) empty superblock to claim for this block size - // 3) part-full superblock of the next larger block size - - sb_state = block_state ; // Expect to find the desired state - sb_id = -1 ; - - bool update_hint = false ; - int32_t sb_id_empty = -1 ; - int32_t sb_id_large = -1 ; - uint32_t sb_state_large = 0 ; - - sb_state_array = m_sb_state_array + sb_id_begin * m_sb_state_size ; - - for ( int32_t i = 0 , id = sb_id_begin ; i < m_sb_count ; ++i ) { - - // Query state of the candidate superblock. - // Note that the state may change at any moment - // as concurrent allocations and deallocations occur. - - const uint32_t full_state = *sb_state_array ; - const uint32_t used = full_state & state_used_mask ; - const uint32_t state = full_state & state_header_mask ; - - if ( state == block_state ) { - - // Superblock is assigned to this block size - - if ( used < block_count ) { - - // There is room to allocate one block - - sb_id = id ; - - // Is there room to allocate more than one block? 
- - update_hint = used + 1 < block_count ; - - break ; - } - } - else if ( 0 == used ) { - - // Superblock is empty - - if ( -1 == sb_id_empty ) { - - // Superblock is not assigned to this block size - // and is the first empty superblock encountered. - // Save this id to use if a partfull superblock is not found. - - sb_id_empty = id ; - } - } - else if ( ( -1 == sb_id_empty /* have not found an empty */ ) && - ( -1 == sb_id_large /* have not found a larger */ ) && - ( state < block_state /* a larger block */ ) && - // is not full: - ( used < ( 1u << ( state >> state_shift ) ) ) ) { - // First superblock encountered that is - // larger than this block size and - // has room for an allocation. - // Save this id to use of partfull or empty superblock not found - sb_id_large = id ; - sb_state_large = state ; - } - - // Iterate around the superblock array: - - if ( ++id < m_sb_count ) { - sb_state_array += m_sb_state_size ; - } - else { - id = 0 ; - sb_state_array = m_sb_state_array ; - } - } - -// printf(" search m_sb_count(%d) sb_id(%d) sb_id_empty(%d) sb_id_large(%d)\n" , m_sb_count , sb_id , sb_id_empty , sb_id_large); - - if ( sb_id < 0 ) { - - // Did not find a partfull superblock for this block size. - - if ( 0 <= sb_id_empty ) { - - // Found first empty superblock following designated superblock - // Attempt to claim it for this block size. - // If the claim fails assume that another thread claimed it - // for this block size and try to use it anyway, - // but do not update hint. - - sb_id = sb_id_empty ; - - sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size ); - - // If successfully changed assignment of empty superblock 'sb_id' - // to this block_size then update the hint. 
- - const uint32_t state_empty = state_header_mask & *sb_state_array ; - - // If this thread claims the empty block then update the hint - update_hint = - state_empty == - Kokkos::atomic_compare_exchange - (sb_state_array,state_empty,block_state); - } - else if ( 0 <= sb_id_large ) { - - // Found a larger superblock with space available - - sb_id = sb_id_large ; - sb_state = sb_state_large ; - - sb_state_array = m_sb_state_array + ( sb_id * m_sb_state_size ); - } - else { - // Did not find a potentially usable superblock - --attempt_limit ; - } - } - - if ( update_hint ) { - Kokkos::atomic_compare_exchange - ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) ); - } - } // end allocation attempt loop - - //-------------------------------------------------------------------- - } - else { - Kokkos::abort("Kokkos MemoryPool allocation request exceeded specified maximum allocation size"); - } + if ( update_hint ) { + Kokkos::atomic_compare_exchange + ( hint_sb_id_ptr , uint32_t(hint_sb_id) , uint32_t(sb_id) ); + } + } // end allocation attempt loop + //-------------------------------------------------------------------- return p ; } @@ -765,7 +807,7 @@ public: const uint32_t block_size_lg2 = m_sb_size_lg2 - ( block_state >> state_shift ); - ok_block_aligned = 0 == ( d & ( ( 1 << block_size_lg2 ) - 1 ) ); + ok_block_aligned = 0 == ( d & ( ( 1UL << block_size_lg2 ) - 1 ) ); if ( ok_block_aligned ) { @@ -773,31 +815,70 @@ public: // mask into superblock and then shift down for block index const uint32_t bit = - ( d & ( ptrdiff_t( 1 << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ; + ( d & ( ptrdiff_t( 1LU << m_sb_size_lg2 ) - 1 ) ) >> block_size_lg2 ; const int result = CB::release( sb_state_array , bit , block_state ); ok_dealloc_once = 0 <= result ; -// printf(" deallocate from sb_id(%d) result(%d) bit(%d) state(0x%x)\n" -// , sb_id -// , result -// , uint32_t(d >> block_size_lg2) -// , *sb_state_array ); - +#if 0 + printf( " MemoryPool(0x%lx) pointer(0x%lx) 
deallocate sb_id(%d) block_size(%d) block_capacity(%d) block_id(%d) block_claimed(%d)\n" + , (uintptr_t)m_sb_state_array + , (uintptr_t)p + , sb_id + , (1u << block_size_lg2) + , (1u << (m_sb_size_lg2 - block_size_lg2)) + , bit + , result ); +#endif } } if ( ! ok_contains || ! ok_block_aligned || ! ok_dealloc_once ) { #if 0 - printf("Kokkos MemoryPool deallocate(0x%lx) contains(%d) block_aligned(%d) dealloc_once(%d)\n",(uintptr_t)p,ok_contains,ok_block_aligned,ok_dealloc_once); + printf( " MemoryPool(0x%lx) pointer(0x%lx) deallocate ok_contains(%d) ok_block_aligned(%d) ok_dealloc_once(%d)\n" + , (uintptr_t)m_sb_state_array + , (uintptr_t)p + , int(ok_contains) + , int(ok_block_aligned) + , int(ok_dealloc_once) ); #endif Kokkos::abort("Kokkos MemoryPool::deallocate given erroneous pointer"); } } // end deallocate //-------------------------------------------------------------------------- + + KOKKOS_INLINE_FUNCTION + int number_of_superblocks() const noexcept { return m_sb_count ; } + + KOKKOS_INLINE_FUNCTION + void superblock_state( int sb_id + , int & block_size + , int & block_count_capacity + , int & block_count_used ) const noexcept + { + block_size = 0 ; + block_count_capacity = 0 ; + block_count_used = 0 ; + + if ( Kokkos::Impl::MemorySpaceAccess + < Kokkos::Impl::ActiveExecutionMemorySpace + , base_memory_space >::accessible ) { + // Can access the state array + + const uint32_t state = + ((uint32_t volatile *)m_sb_state_array)[sb_id*m_sb_state_size]; + + const uint32_t block_count_lg2 = state >> state_shift ; + const uint32_t block_used = state & state_used_mask ; + + block_size = 1LU << ( m_sb_size_lg2 - block_count_lg2 ); + block_count_capacity = 1LU << block_count_lg2 ; + block_count_used = block_used ; + } + } }; } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp index af9c8ea782..36170e4a8e 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp +++ 
b/lib/kokkos/core/src/Kokkos_MemoryTraits.hpp @@ -97,26 +97,22 @@ typedef Kokkos::MemoryTraits< Kokkos::Unmanaged | Kokkos::RandomAccess > MemoryR namespace Kokkos { namespace Impl { +static_assert( + ( 0 < int(KOKKOS_MEMORY_ALIGNMENT) ) && + ( 0 == ( int(KOKKOS_MEMORY_ALIGNMENT) & (int(KOKKOS_MEMORY_ALIGNMENT)-1))) , + "KOKKOS_MEMORY_ALIGNMENT must be a power of two" ); + /** \brief Memory alignment settings * * Sets global value for memory alignment. Must be a power of two! * Enable compatibility of views from different devices with static stride. * Use compiler flag to enable overwrites. */ -enum { MEMORY_ALIGNMENT = -#if defined( KOKKOS_MEMORY_ALIGNMENT ) - ( 1 << Kokkos::Impl::integral_power_of_two( KOKKOS_MEMORY_ALIGNMENT ) ) -#else - ( 1 << Kokkos::Impl::integral_power_of_two( 128 ) ) -#endif -#if defined( KOKKOS_MEMORY_ALIGNMENT_THRESHOLD ) +enum : unsigned + { MEMORY_ALIGNMENT = KOKKOS_MEMORY_ALIGNMENT , MEMORY_ALIGNMENT_THRESHOLD = KOKKOS_MEMORY_ALIGNMENT_THRESHOLD -#else - , MEMORY_ALIGNMENT_THRESHOLD = 4 -#endif }; - } //namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp index a825fd54d3..7264ba7f38 100644 --- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -204,6 +204,7 @@ struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static double min() {return DBL_MAX;} }; +#if !defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA ) template<> struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static long double sum() {return static_cast(0.0);} @@ -211,6 +212,7 @@ struct reduction_identity { KOKKOS_FORCEINLINE_FUNCTION constexpr static long double max() {return -LDBL_MAX;} KOKKOS_FORCEINLINE_FUNCTION constexpr static long double min() {return LDBL_MAX;} }; +#endif } diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index 067767f2f8..b6b1596883 100644 --- 
a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -78,7 +78,7 @@ struct pair /// This calls the default constructors of T1 and T2. It won't /// compile if those default constructors are not defined and /// public. - KOKKOS_FORCEINLINE_FUNCTION constexpr + KOKKOS_FUNCTION_DEFAULTED constexpr pair() = default ; /// \brief Constructor that takes both elements of the pair. @@ -458,7 +458,7 @@ struct pair first_type first; enum { second = 0 }; - KOKKOS_FORCEINLINE_FUNCTION constexpr + KOKKOS_FUNCTION_DEFAULTED constexpr pair() = default ; KOKKOS_FORCEINLINE_FUNCTION constexpr diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index fc8d6bec81..0ceae866c4 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -241,7 +241,7 @@ void parallel_for( const std::string & str std::cout << "KOKKOS_DEBUG Start parallel_for kernel: " << str << std::endl; #endif - parallel_for(policy,functor,str); + ::Kokkos::parallel_for(policy,functor,str); #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES Kokkos::fence(); @@ -487,7 +487,7 @@ void parallel_scan( const std::string& str std::cout << "KOKKOS_DEBUG Start parallel_scan kernel: " << str << std::endl; #endif - parallel_scan(policy,functor,str); + ::Kokkos::parallel_scan(policy,functor,str); #if KOKKOS_ENABLE_DEBUG_PRINT_KERNEL_NAMES Kokkos::fence(); diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp new file mode 100644 index 0000000000..b5e58507d6 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -0,0 +1,111 @@ +/* + //@HEADER + // ************************************************************************ + // + // Kokkos v. 2.0 + // Copyright (2014) Sandia Corporation + // + // Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, + // the U.S. Government retains certain rights in this software. 
+ // + // Redistribution and use in source and binary forms, with or without + // modification, are permitted provided that the following conditions are + // met: + // + // 1. Redistributions of source code must retain the above copyright + // notice, this list of conditions and the following disclaimer. + // + // 2. Redistributions in binary form must reproduce the above copyright + // notice, this list of conditions and the following disclaimer in the + // documentation and/or other materials provided with the distribution. + // + // 3. Neither the name of the Corporation nor the names of the + // contributors may be used to endorse or promote products derived from + // this software without specific prior written permission. + // + // THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY + // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE + // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + // + // Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) + // + // ************************************************************************ + //@HEADER + */ + +#ifndef KOKKOSP_PROFILE_SECTION_HPP +#define KOKKOSP_PROFILE_SECTION_HPP + +#include +#include + +#include + +namespace Kokkos { +namespace Profiling { + +class ProfilingSection { /* Named profiling section: registered in the constructor, released in the destructor, bracketed by start()/stop(); a call only reaches the tool when profiling is compiled in and a profiling library is loaded. */ + +public: + ProfilingSection(const std::string& sectionName) : + secName(sectionName), secID(0) { /* secID starts at 0 so getSectionID() is well defined even when no library assigns an ID */ + + #if defined( KOKKOS_ENABLE_PROFILING ) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::createProfileSection(secName, &secID); + } + #else + secID = 0; + #endif + } + + void start() { + #if defined( KOKKOS_ENABLE_PROFILING ) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::startSection(secID); + } + #endif + } + + void stop() { + #if defined( KOKKOS_ENABLE_PROFILING ) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::stopSection(secID); + } + #endif + } + + ~ProfilingSection() { + #if defined( KOKKOS_ENABLE_PROFILING ) + if(Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::destroyProfileSection(secID); + } + #endif + } + + std::string getName() const { + return secName; + } + + uint32_t getSectionID() const { + return secID; + } + +protected: + const std::string secName; + uint32_t secID; + +}; + +} +} + +#endif \ No newline at end of file diff --git a/lib/kokkos/core/src/Kokkos_ROCm.hpp b/lib/kokkos/core/src/Kokkos_ROCm.hpp index 0118d4667e..6effbb6090 100644 --- a/lib/kokkos/core/src/Kokkos_ROCm.hpp +++ b/lib/kokkos/core/src/Kokkos_ROCm.hpp @@ -204,8 +204,8 @@ struct VerifyExecutionCanAccessMemorySpace > { enum { value = false }; - inline static void verify( void ) { Experimental::ROCmSpace::access_error(); } - inline static void verify( const void * p ) { Experimental::ROCmSpace::access_error(p); } + inline static void verify( void ) { Kokkos::Experimental::ROCmSpace::access_error(); } + inline static void verify( const void * p ) { Kokkos::Experimental::ROCmSpace::access_error(p); } }; } 
// namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_Serial.hpp b/lib/kokkos/core/src/Kokkos_Serial.hpp index 539761a1f9..e713461f8d 100644 --- a/lib/kokkos/core/src/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Kokkos_Serial.hpp @@ -145,7 +145,7 @@ public: unsigned use_cores_per_numa = 0 , bool allow_asynchronous_threadpool = false); - static int is_initialized(); + static bool is_initialized(); /** \brief Return the maximum amount of concurrency. */ static int concurrency() {return 1;}; @@ -424,11 +424,13 @@ private: typedef typename Policy::work_tag WorkTag ; typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; + typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; @@ -488,7 +490,7 @@ public: this-> template exec< WorkTag >( update ); - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >:: + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >:: final( ReducerConditional::select(m_functor , m_reducer) , ptr ); } @@ -619,16 +621,16 @@ namespace Impl { template< class FunctorType , class ... Traits > class ParallelFor< FunctorType , - Kokkos::Experimental::MDRangePolicy< Traits ... > , + Kokkos::MDRangePolicy< Traits ... > , Kokkos::Serial > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef Kokkos::MDRangePolicy< Traits ... 
> MDRangePolicy ; typedef typename MDRangePolicy::impl_range_policy Policy ; - typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; const FunctorType m_functor ; const MDRangePolicy m_mdr_policy ; @@ -661,32 +663,33 @@ public: template< class FunctorType , class ReducerType , class ... Traits > class ParallelReduce< FunctorType - , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::MDRangePolicy< Traits ... > , ReducerType , Kokkos::Serial > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ; typedef typename MDRangePolicy::impl_range_policy Policy ; typedef typename MDRangePolicy::work_tag WorkTag ; typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; typedef typename ReducerTypeFwd::value_type ValueType; typedef FunctorAnalysis< FunctorPatternInterface::REDUCE , Policy , FunctorType > Analysis ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy , FunctorType , WorkTag , ValueType @@ -735,7 +738,7 @@ public: this-> exec( update ); - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >:: + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >:: final( 
ReducerConditional::select(m_functor , m_reducer) , ptr ); } @@ -878,8 +881,9 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; @@ -940,7 +944,7 @@ public: this-> template exec< WorkTag >( data , update ); - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >:: + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >:: final( ReducerConditional::select(m_functor , m_reducer) , ptr ); } diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 47b105cfdc..d4940b3412 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -408,7 +408,7 @@ view_alloc( Args const & ... args ) } template< class ... Args > -inline +KOKKOS_INLINE_FUNCTION Impl::ViewCtorProp< typename Impl::ViewCtorProp< void , Args >::type ... > view_wrap( Args const & ... args ) { @@ -1216,6 +1216,13 @@ public: m_track.assign_allocated_record_to_uninitialized( record ); } + KOKKOS_INLINE_FUNCTION + void assign_data( pointer_type arg_data ) + { + m_track.clear(); + m_map.assign_data( arg_data ); + } + // Wrap memory according to properties and array layout template< class ... 
P > explicit KOKKOS_INLINE_FUNCTION @@ -2235,6 +2242,29 @@ create_mirror_view(const Space& , const Kokkos::View & src return typename Impl::MirrorViewType::view_type(src.label(),src.layout()); } +// Create a mirror view and deep_copy in a new space (specialization for same space) +template +typename Impl::MirrorViewType::view_type +create_mirror_view_and_copy(const Space& , const Kokkos::View & src + , std::string const& name = "" + , typename std::enable_if::is_same_memspace>::type* = 0 ) { + (void)name; + return src; +} + +// Create a mirror view and deep_copy in a new space (specialization for different space) +template +typename Impl::MirrorViewType::view_type +create_mirror_view_and_copy(const Space& , const Kokkos::View & src + , std::string const& name = "" + , typename std::enable_if::is_same_memspace>::type* = 0 ) { + using Mirror = typename Impl::MirrorViewType::view_type; + std::string label = name.empty() ? src.label() : name; + auto mirror = Mirror(ViewAllocateWithoutInitializing(label), src.layout()); + deep_copy(mirror, src); + return mirror; +} + } /* namespace Kokkos */ //---------------------------------------------------------------------------- @@ -2432,6 +2462,7 @@ struct CommonViewAllocProp< void, ValueType > using scalar_array_type = ValueType; template < class ... Views > + KOKKOS_INLINE_FUNCTION CommonViewAllocProp( const Views & ... ) {} }; @@ -2499,6 +2530,7 @@ using DeducedCommonPropsType = typename Impl::DeduceCommonViewAllocProp +KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( Views const & ... views ) { diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index 58b0f72f51..f9521a7e38 100644 --- a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -46,205 +46,198 @@ namespace Kokkos { namespace Impl { -namespace Experimental { template< class functor_type , class execution_space, class ... 
policy_args > class WorkGraphExec; -}}} // namespace Kokkos::Impl::Experimental +}} // namespace Kokkos::Impl namespace Kokkos { -namespace Experimental { template< class ... Properties > class WorkGraphPolicy { public: - using self_type = WorkGraphPolicy; - using traits = Kokkos::Impl::PolicyTraits; - using index_type = typename traits::index_type; + using self_type = WorkGraphPolicy; + using traits = Kokkos::Impl::PolicyTraits; + using index_type = typename traits::index_type; + using member_type = index_type; + using work_tag = typename traits::work_tag; using execution_space = typename traits::execution_space; - using work_tag = typename traits::work_tag; - using memory_space = typename execution_space::memory_space; - using graph_type = Kokkos::Experimental::Crs; - using member_type = index_type; + using memory_space = typename execution_space::memory_space; + using graph_type = Kokkos::Crs; + + enum : std::int32_t { + END_TOKEN = -1 , + BEGIN_TOKEN = -2 , + COMPLETED_TOKEN = -3 }; private: - - graph_type m_graph; using ints_type = Kokkos::View; - using range_type = Kokkos::pair; - using ranges_type = Kokkos::View; - const std::int32_t m_total_work; - ints_type m_counts; - ints_type m_queue; - ranges_type m_ranges; + + // Let N = m_graph.numRows(), the total work + // m_queue[ 0 .. N-1] = the ready queue + // m_queue[ N .. 2*N-1] = the waiting queue counts + // m_queue[2*N .. 
2*N+1] = the ready queue hints + + graph_type const m_graph; + ints_type m_queue ; + + KOKKOS_INLINE_FUNCTION + void push_work( const std::int32_t w ) const noexcept + { + const std::int32_t N = m_graph.numRows(); + + std::int32_t volatile * const ready_queue = & m_queue[0] ; + std::int32_t volatile * const end_hint = & m_queue[2*N+1] ; + + // Push work to end of queue + const std::int32_t j = atomic_fetch_add( end_hint , 1 ); + + if ( ( N <= j ) || + ( END_TOKEN != atomic_exchange(ready_queue+j,w) ) ) { + // ERROR: past the end of queue or did not replace END_TOKEN + Kokkos::abort("WorkGraphPolicy push_work error"); + } + + memory_fence(); + } public: - struct TagZeroRanges {}; + /**\brief Attempt to pop the work item at the head of the queue. + * + * Find entry 'i' such that + * ( m_queue[i] != BEGIN_TOKEN ) AND + * ( i == 0 OR m_queue[i-1] == BEGIN_TOKEN ) + * if found then + * increment begin hint + * return atomic_exchange( m_queue[i] , BEGIN_TOKEN ) + * else if i < total work + * return END_TOKEN + * else + * return COMPLETED_TOKEN + * + */ KOKKOS_INLINE_FUNCTION - void operator()(TagZeroRanges, std::int32_t i) const { - m_ranges[i] = range_type(0, 0); - } - void zero_ranges() { - using policy_type = RangePolicy; - using closure_type = Kokkos::Impl::ParallelFor; - const closure_type closure(*this, policy_type(0, 1)); - closure.execute(); - execution_space::fence(); - } + std::int32_t pop_work() const noexcept + { + const std::int32_t N = m_graph.numRows(); - struct TagFillQueue {}; - KOKKOS_INLINE_FUNCTION - void operator()(TagFillQueue, std::int32_t i) const { - if (*((volatile std::int32_t*)(&m_counts(i))) == 0) push_work(i); - } - void fill_queue() { - using policy_type = RangePolicy; - using closure_type = Kokkos::Impl::ParallelFor; - const closure_type closure(*this, policy_type(0, m_total_work)); - closure.execute(); - execution_space::fence(); - } + std::int32_t volatile * const ready_queue = & m_queue[0] ; + std::int32_t volatile * const begin_hint = 
& m_queue[2*N] ; -private: + // begin hint is guaranteed to be less than or equal to + // actual begin location in the queue. - inline - void setup() { - if (m_graph.numRows() > std::numeric_limits::max()) { - Kokkos::abort("WorkGraphPolicy work must be indexable using int32_t"); - } - get_crs_transpose_counts(m_counts, m_graph); - m_queue = ints_type(ViewAllocateWithoutInitializing("queue"), m_total_work); - deep_copy(m_queue, std::int32_t(-1)); - m_ranges = ranges_type("ranges", 1); - fill_queue(); - } + for ( std::int32_t i = *begin_hint ; i < N ; ++i ) { - KOKKOS_INLINE_FUNCTION - std::int32_t pop_work() const { - range_type w(-1,-1); - while (true) { - const range_type w_new( w.first + 1 , w.second ); - w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); - if ( w.first < w.second ) { // there was work in the queue - if ( w_new.first == w.first + 1 && w_new.second == w.second ) { - // we got a work item - std::int32_t i; - // the push_work function may have incremented the end counter - // but not yet written the work index into the queue. - // wait until the entry is valid. 
- while ( -1 == ( i = *((volatile std::int32_t*)(&m_queue( w.first ))) ) ); - return i; - } // we got a work item - } else { // there was no work in the queue -#ifdef KOKKOS_DEBUG - if ( w_new.first == w.first + 1 && w_new.second == w.second ) { - Kokkos::abort("bug in pop_work"); + const std::int32_t w = ready_queue[i] ; + + if ( w == END_TOKEN ) { return END_TOKEN ; } + + if ( ( w != BEGIN_TOKEN ) && + ( w == atomic_compare_exchange(ready_queue+i,w,BEGIN_TOKEN) ) ) { + // Attempt to claim ready work index succeeded, + // update the hint and return work index + atomic_increment( begin_hint ); + return w ; } -#endif - if (w.first == m_total_work) { // all work is done - return -1; - } else { // need to wait for more work to be pushed - // take a guess that one work item will be pushed - // the key thing is we can't leave (w) alone, because - // otherwise the next compare_exchange may succeed in - // popping work from an empty queue - w.second++; - } - } // there was no work in the queue - } // while (true) - } + // arrive here when ready_queue[i] == BEGIN_TOKEN + } + + return COMPLETED_TOKEN ; + } + KOKKOS_INLINE_FUNCTION - void push_work(std::int32_t i) const { - range_type w(-1,-1); - while (true) { - const range_type w_new( w.first , w.second + 1 ); - // try to increment the end counter - w = atomic_compare_exchange( &m_ranges(0) , w , w_new ); - // stop trying if the increment was successful - if ( w.first == w_new.first && w.second + 1 == w_new.second ) break; + void completed_work( std::int32_t w ) const noexcept + { + Kokkos::memory_fence(); + + // Make sure the completed work function's memory accesses are flushed. 
+ + const std::int32_t N = m_graph.numRows(); + + std::int32_t volatile * const count_queue = & m_queue[N] ; + + const std::int32_t B = m_graph.row_map(w); + const std::int32_t E = m_graph.row_map(w+1); + + for ( std::int32_t i = B ; i < E ; ++i ) { + const std::int32_t j = m_graph.entries(i); + if ( 1 == atomic_fetch_add(count_queue+j,-1) ) { + push_work(j); + } + } } - // write the work index into the claimed spot in the queue - *((volatile std::int32_t*)(&m_queue( w.second ))) = i; - // push this write out into the memory system - memory_fence(); - } - template< class functor_type , class execution_space, class ... policy_args > - friend class Kokkos::Impl::Experimental::WorkGraphExec; + struct TagInit {}; + struct TagCount {}; + struct TagReady {}; -public: + /**\brief Initialize queue + * + * m_queue[0..N-1] = END_TOKEN, the ready queue + * m_queue[N..2*N-1] = 0, the waiting count queue + * m_queue[2*N..2*N+1] = 0, begin/end hints for ready queue + */ + KOKKOS_INLINE_FUNCTION + void operator()( const TagInit , int i ) const noexcept + { m_queue[i] = i < m_graph.numRows() ? 
END_TOKEN : 0 ; } - WorkGraphPolicy(graph_type arg_graph) + KOKKOS_INLINE_FUNCTION + void operator()( const TagCount , int i ) const noexcept + { + std::int32_t volatile * const count_queue = + & m_queue[ m_graph.numRows() ] ; + + atomic_increment( count_queue + m_graph.entries[i] ); + } + + KOKKOS_INLINE_FUNCTION + void operator()( const TagReady , int w ) const noexcept + { + std::int32_t const * const count_queue = + & m_queue[ m_graph.numRows() ] ; + + if ( 0 == count_queue[w] ) push_work(w); + } + + WorkGraphPolicy( const graph_type & arg_graph ) : m_graph(arg_graph) - , m_total_work( arg_graph.numRows() ) + , m_queue( view_alloc( "queue" , WithoutInitializing ) + , arg_graph.numRows() * 2 + 2 ) { - setup(); - } + { // Initialize + using policy_type = RangePolicy; + using closure_type = Kokkos::Impl::ParallelFor; + const closure_type closure(*this, policy_type(0, m_queue.size())); + closure.execute(); + execution_space::fence(); + } -}; + { // execute-after counts + using policy_type = RangePolicy; + using closure_type = Kokkos::Impl::ParallelFor; + const closure_type closure(*this,policy_type(0,m_graph.entries.size())); + closure.execute(); + execution_space::fence(); + } -}} // namespace Kokkos::Experimental - -/*--------------------------------------------------------------------------*/ - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { -namespace Experimental { - -template< class functor_type , class execution_space, class ... policy_args > -class WorkGraphExec -{ - public: - - using self_type = WorkGraphExec< functor_type, execution_space, policy_args ... >; - using policy_type = Kokkos::Experimental::WorkGraphPolicy< policy_args ... 
>; - using member_type = typename policy_type::member_type; - using memory_space = typename execution_space::memory_space; - - protected: - - const functor_type m_functor; - const policy_type m_policy; - - protected: - - KOKKOS_INLINE_FUNCTION - std::int32_t before_work() const { - return m_policy.pop_work(); - } - - KOKKOS_INLINE_FUNCTION - void after_work(std::int32_t i) const { - /* fence any writes that were done by the work item itself - (usually writing its result to global memory) */ - memory_fence(); - const std::int32_t begin = m_policy.m_graph.row_map( i ); - const std::int32_t end = m_policy.m_graph.row_map( i + 1 ); - for (std::int32_t j = begin; j < end; ++j) { - const std::int32_t next = m_policy.m_graph.entries( j ); - const std::int32_t old_count = atomic_fetch_add( &(m_policy.m_counts(next)), -1 ); - if ( old_count == 1 ) m_policy.push_work( next ); + { // Scheduling ready tasks + using policy_type = RangePolicy; + using closure_type = Kokkos::Impl::ParallelFor; + const closure_type closure(*this,policy_type(0,m_graph.numRows())); + closure.execute(); + execution_space::fence(); } } - - inline - WorkGraphExec( const functor_type & arg_functor - , const policy_type & arg_policy ) - : m_functor( arg_functor ) - , m_policy( arg_policy ) - { - } }; -}}} // namespace Kokkos::Impl::Experimental +} // namespace Kokkos #ifdef KOKKOS_ENABLE_SERIAL #include "impl/Kokkos_Serial_WorkGraphPolicy.hpp" diff --git a/lib/kokkos/core/src/Makefile b/lib/kokkos/core/src/Makefile index a917cf1656..6ee5fec716 100644 --- a/lib/kokkos/core/src/Makefile +++ b/lib/kokkos/core/src/Makefile @@ -5,51 +5,44 @@ endif PREFIX ?= /usr/local/lib/kokkos -default: messages build-lib - echo "End Build" +default: build-lib ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) - CXX = $(KOKKOS_PATH)/bin/nvcc_wrapper + CXX ?= $(KOKKOS_PATH)/bin/nvcc_wrapper else - CXX = g++ + CXX ?= g++ endif -CXXFLAGS = -O3 +CXXFLAGS ?= -O3 LINK ?= $(CXX) LDFLAGS ?= include $(KOKKOS_PATH)/Makefile.kokkos - -PWD 
= $(shell pwd) - -KOKKOS_HEADERS_INCLUDE = $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) -KOKKOS_HEADERS_INCLUDE_IMPL = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) -KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) -KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) -KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp) +include $(KOKKOS_PATH)/core/src/Makefile.generate_header_lists +include $(KOKKOS_PATH)/core/src/Makefile.generate_build_files CONDITIONAL_COPIES = ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp) CONDITIONAL_COPIES += copy-cuda endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) - KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp) CONDITIONAL_COPIES += copy-threads endif ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1) - KOKKOS_HEADERS_QTHREADS += $(wildcard $(KOKKOS_PATH)/core/src/Qthreads/*.hpp) CONDITIONAL_COPIES += copy-qthreads endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp) CONDITIONAL_COPIES += copy-openmp endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + CONDITIONAL_COPIES += copy-rocm +endif + ifeq ($(KOKKOS_OS),CYGWIN) COPY_FLAG = -u endif @@ -66,104 +59,7 @@ else KOKKOS_DEBUG_CMAKE = ON endif -messages: - echo "Start Build" - -build-makefile-kokkos: - rm -f Makefile.kokkos - echo "#Global Settings used to generate this library" >> Makefile.kokkos - echo "KOKKOS_PATH = $(PREFIX)" >> Makefile.kokkos - echo "KOKKOS_DEVICES = $(KOKKOS_DEVICES)" >> Makefile.kokkos - echo "KOKKOS_ARCH = $(KOKKOS_ARCH)" >> Makefile.kokkos - echo "KOKKOS_DEBUG = $(KOKKOS_DEBUG)" >> Makefile.kokkos - echo "KOKKOS_USE_TPLS = $(KOKKOS_USE_TPLS)" >> Makefile.kokkos - echo "KOKKOS_CXX_STANDARD = $(KOKKOS_CXX_STANDARD)" >> Makefile.kokkos - echo "KOKKOS_OPTIONS = $(KOKKOS_OPTIONS)" >> Makefile.kokkos - echo "KOKKOS_CUDA_OPTIONS = 
$(KOKKOS_CUDA_OPTIONS)" >> Makefile.kokkos - echo "CXX ?= $(CXX)" >> Makefile.kokkos - echo "NVCC_WRAPPER ?= $(PREFIX)/bin/nvcc_wrapper" >> Makefile.kokkos - echo "" >> Makefile.kokkos - echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> Makefile.kokkos - echo "KOKKOS_HEADERS = $(KOKKOS_HEADERS)" >> Makefile.kokkos - echo "KOKKOS_SRC = $(KOKKOS_SRC)" >> Makefile.kokkos - echo "" >> Makefile.kokkos - echo "#Variables used in application Makefiles" >> Makefile.kokkos - echo "KOKKOS_OS = $(KOKKOS_OS)" >> Makefile.kokkos - echo "KOKKOS_CPP_DEPENDS = $(KOKKOS_CPP_DEPENDS)" >> Makefile.kokkos - echo "KOKKOS_CXXFLAGS = $(KOKKOS_CXXFLAGS)" >> Makefile.kokkos - echo "KOKKOS_CPPFLAGS = $(KOKKOS_CPPFLAGS)" >> Makefile.kokkos - echo "KOKKOS_LINK_DEPENDS = $(KOKKOS_LINK_DEPENDS)" >> Makefile.kokkos - echo "KOKKOS_LIBS = $(KOKKOS_LIBS)" >> Makefile.kokkos - echo "KOKKOS_LDFLAGS = $(KOKKOS_LDFLAGS)" >> Makefile.kokkos - echo "" >> Makefile.kokkos - echo "#Internal settings which need to propagated for Kokkos examples" >> Makefile.kokkos - echo "KOKKOS_INTERNAL_USE_CUDA = ${KOKKOS_INTERNAL_USE_CUDA}" >> Makefile.kokkos - echo "KOKKOS_INTERNAL_USE_QTHREADS = ${KOKKOS_INTERNAL_USE_QTHREADS}" >> Makefile.kokkos - echo "KOKKOS_INTERNAL_USE_OPENMP = ${KOKKOS_INTERNAL_USE_OPENMP}" >> Makefile.kokkos - echo "KOKKOS_INTERNAL_USE_PTHREADS = ${KOKKOS_INTERNAL_USE_PTHREADS}" >> Makefile.kokkos - echo "" >> Makefile.kokkos - echo "#Fake kokkos-clean target" >> Makefile.kokkos - echo "kokkos-clean:" >> Makefile.kokkos - echo "" >> Makefile.kokkos - sed \ - -e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \ - -e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \ - -e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \ - -e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \ - -e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \ - -e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' Makefile.kokkos \ - > Makefile.kokkos.tmp - mv -f Makefile.kokkos.tmp Makefile.kokkos 
- -build-cmake-kokkos: - rm -f kokkos.cmake - echo "#Global Settings used to generate this library" >> kokkos.cmake - echo "set(KOKKOS_PATH $(PREFIX) CACHE PATH \"Kokkos installation path\")" >> kokkos.cmake - echo "set(KOKKOS_DEVICES $(KOKKOS_DEVICES) CACHE STRING \"Kokkos devices list\")" >> kokkos.cmake - echo "set(KOKKOS_ARCH $(KOKKOS_ARCH) CACHE STRING \"Kokkos architecture flags\")" >> kokkos.cmake - echo "set(KOKKOS_DEBUG $(KOKKOS_DEBUG_CMAKE) CACHE BOOL \"Kokkos debug enabled ?)\")" >> kokkos.cmake - echo "set(KOKKOS_USE_TPLS $(KOKKOS_USE_TPLS) CACHE STRING \"Kokkos templates list\")" >> kokkos.cmake - echo "set(KOKKOS_CXX_STANDARD $(KOKKOS_CXX_STANDARD) CACHE STRING \"Kokkos C++ standard\")" >> kokkos.cmake - echo "set(KOKKOS_OPTIONS $(KOKKOS_OPTIONS) CACHE STRING \"Kokkos options\")" >> kokkos.cmake - echo "set(KOKKOS_CUDA_OPTIONS $(KOKKOS_CUDA_OPTIONS) CACHE STRING \"Kokkos Cuda options\")" >> kokkos.cmake - echo "if(NOT $ENV{CXX})" >> kokkos.cmake - echo ' message(WARNING "You are currently using compiler $${CMAKE_CXX_COMPILER} while Kokkos was built with $(CXX) ; make sure this is the behavior you intended to be.")' >> kokkos.cmake - echo "endif()" >> kokkos.cmake - echo "if(NOT DEFINED ENV{NVCC_WRAPPER})" >> kokkos.cmake - echo " set(NVCC_WRAPPER \"$(NVCC_WRAPPER)\" CACHE FILEPATH \"Path to command nvcc_wrapper\")" >> kokkos.cmake - echo "else()" >> kokkos.cmake - echo ' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")' >> kokkos.cmake - echo "endif()" >> kokkos.cmake - echo "" >> kokkos.cmake - echo "#Source and Header files of Kokkos relative to KOKKOS_PATH" >> kokkos.cmake - echo "set(KOKKOS_HEADERS \"$(KOKKOS_HEADERS)\" CACHE STRING \"Kokkos headers list\")" >> kokkos.cmake - echo "set(KOKKOS_SRC \"$(KOKKOS_SRC)\" CACHE STRING \"Kokkos source list\")" >> kokkos.cmake - echo "" >> kokkos.cmake - echo "#Variables used in application Makefiles" >> kokkos.cmake - echo "set(KOKKOS_CPP_DEPENDS \"$(KOKKOS_CPP_DEPENDS)\" 
CACHE STRING \"\")" >> kokkos.cmake - echo "set(KOKKOS_CXXFLAGS \"$(KOKKOS_CXXFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake - echo "set(KOKKOS_CPPFLAGS \"$(KOKKOS_CPPFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake - echo "set(KOKKOS_LINK_DEPENDS \"$(KOKKOS_LINK_DEPENDS)\" CACHE STRING \"\")" >> kokkos.cmake - echo "set(KOKKOS_LIBS \"$(KOKKOS_LIBS)\" CACHE STRING \"\")" >> kokkos.cmake - echo "set(KOKKOS_LDFLAGS \"$(KOKKOS_LDFLAGS)\" CACHE STRING \"\")" >> kokkos.cmake - echo "" >> kokkos.cmake - echo "#Internal settings which need to propagated for Kokkos examples" >> kokkos.cmake - echo "set(KOKKOS_INTERNAL_USE_CUDA \"${KOKKOS_INTERNAL_USE_CUDA}\" CACHE STRING \"\")" >> kokkos.cmake - echo "set(KOKKOS_INTERNAL_USE_OPENMP \"${KOKKOS_INTERNAL_USE_OPENMP}\" CACHE STRING \"\")" >> kokkos.cmake - echo "set(KOKKOS_INTERNAL_USE_PTHREADS \"${KOKKOS_INTERNAL_USE_PTHREADS}\" CACHE STRING \"\")" >> kokkos.cmake - echo "mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS)" >> kokkos.cmake - echo "" >> kokkos.cmake - sed \ - -e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \ - -e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \ - -e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \ - -e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \ - -e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \ - -e 's|= KokkosCore_config.h|= $(PREFIX)/include/KokkosCore_config.h|g' kokkos.cmake \ - > kokkos.cmake.tmp - mv -f kokkos.cmake.tmp kokkos.cmake - -build-lib: build-makefile-kokkos build-cmake-kokkos $(KOKKOS_LINK_DEPENDS) +build-lib: $(KOKKOS_LINK_DEPENDS) mkdir: mkdir -p $(PREFIX) @@ -188,14 +84,18 @@ copy-openmp: mkdir mkdir -p $(PREFIX)/include/OpenMP cp $(COPY_FLAG) $(KOKKOS_HEADERS_OPENMP) $(PREFIX)/include/OpenMP -install: mkdir $(CONDITIONAL_COPIES) build-lib +copy-rocm: mkdir + mkdir -p $(PREFIX)/include/ROCm + cp $(COPY_FLAG) $(KOKKOS_HEADERS_ROCM) $(PREFIX)/include/ROCm + +install: mkdir $(CONDITIONAL_COPIES) 
build-lib generate_build_settings cp $(COPY_FLAG) $(NVCC_WRAPPER) $(PREFIX)/bin cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE) $(PREFIX)/include cp $(COPY_FLAG) $(KOKKOS_HEADERS_INCLUDE_IMPL) $(PREFIX)/include/impl - cp $(COPY_FLAG) Makefile.kokkos $(PREFIX) - cp $(COPY_FLAG) kokkos.cmake $(PREFIX) + cp $(COPY_FLAG) $(KOKKOS_MAKEFILE) $(PREFIX) + cp $(COPY_FLAG) $(KOKKOS_CMAKEFILE) $(PREFIX) cp $(COPY_FLAG) libkokkos.a $(PREFIX)/lib - cp $(COPY_FLAG) KokkosCore_config.h $(PREFIX)/include + cp $(COPY_FLAG) $(KOKKOS_CONFIG_HEADER) $(PREFIX)/include clean: kokkos-clean - rm -f Makefile.kokkos + rm -f $(KOKKOS_MAKEFILE) $(KOKKOS_CMAKEFILE) diff --git a/lib/kokkos/core/src/Makefile.generate_build_files b/lib/kokkos/core/src/Makefile.generate_build_files new file mode 100644 index 0000000000..d55967f84f --- /dev/null +++ b/lib/kokkos/core/src/Makefile.generate_build_files @@ -0,0 +1,100 @@ +# This file is responsible for generating files which will be used +# by build system (make and cmake) in scenarios where the kokkos library +# gets installed before building the app + +# These files are generated by this makefile +KOKKOS_MAKEFILE=Makefile.kokkos +KOKKOS_CMAKEFILE=kokkos_generated_settings.cmake + +ifeq ($(KOKKOS_DEBUG),"no") + KOKKOS_DEBUG_CMAKE = OFF +else + KOKKOS_DEBUG_CMAKE = ON +endif + +# Functions for generating makefile and cmake file +# In calling these routines, do not put space after , +# e.g., $(call kokkos_append_var,KOKKOS_PATH,$(PREFIX)) +kokkos_append_makefile = echo $1 >> $(KOKKOS_MAKEFILE) +kokkos_append_cmakefile = echo $1 >> $(KOKKOS_CMAKEFILE) + +kokkos_setvar_cmakefile = echo set\($1 $2\) >> $(KOKKOS_CMAKEFILE) +kokkos_setlist_cmakefile = echo set\($1 \"$2\"\) >> $(KOKKOS_CMAKEFILE) + +kokkos_appendvar_makefile = echo $1 = $($(1)) >> $(KOKKOS_MAKEFILE) +kokkos_appendvar2_makefile = echo $1 ?= $($(1)) >> $(KOKKOS_MAKEFILE) +kokkos_appendvar_cmakefile = echo set\($1 $($(1)) CACHE $2 FORCE\) >> $(KOKKOS_CMAKEFILE) +kokkos_appendval_makefile = echo $1 = 
$2 >> $(KOKKOS_MAKEFILE) +kokkos_appendval_cmakefile = echo set\($1 $2 CACHE $3 FORCE\) >> $(KOKKOS_CMAKEFILE) + +kokkos_append_string = $(call kokkos_append_makefile,$1); $(call kokkos_append_cmakefile,$1) +kokkos_append_var = $(call kokkos_appendvar_makefile,$1); $(call kokkos_appendvar_cmakefile,$1,$2) +kokkos_append_var2 = $(call kokkos_appendvar2_makefile,$1); $(call kokkos_appendvar_cmakefile,$1,$2) +kokkos_append_varval = $(call kokkos_appendval_makefile,$1,$2); $(call kokkos_appendval_cmakefile,$1,$2,$3) + +generate_build_settings: $(KOKKOS_CONFIG_HEADER) + @rm -f $(KOKKOS_MAKEFILE) + @rm -f $(KOKKOS_CMAKEFILE) + @$(call kokkos_append_string, "#Global Settings used to generate this library") + @$(call kokkos_append_varval,KOKKOS_PATH,$(KOKKOS_INSTALL_PATH),'FILEPATH "Kokkos installation path"') + @$(call kokkos_append_var,KOKKOS_DEVICES,'STRING "Kokkos devices list"') + @$(call kokkos_append_var,KOKKOS_ARCH,'STRING "Kokkos architecture flags"') + @$(call kokkos_appendvar_makefile,KOKKOS_DEBUG) + @$(call kokkos_appendvar_cmakefile,KOKKOS_DEBUG_CMAKE,'BOOL "Kokkos debug enabled ?"') + @$(call kokkos_append_var,KOKKOS_USE_TPLS,'STRING "Kokkos templates list"') + @$(call kokkos_append_var,KOKKOS_CXX_STANDARD,'STRING "Kokkos C++ standard"') + @$(call kokkos_append_var,KOKKOS_OPTIONS,'STRING "Kokkos options"') + @$(call kokkos_append_var,KOKKOS_CUDA_OPTIONS,'STRING "Kokkos Cuda options"') + @$(call kokkos_appendvar2_makefile,CXX) # Makefile only: writes a conditional CXX assignment, no CMake cache entry + @$(call kokkos_append_cmakefile,"if(NOT DEFINED ENV{NVCC_WRAPPER})") + @$(call kokkos_append_var2,NVCC_WRAPPER,'FILEPATH "Path to command nvcc_wrapper"') + @$(call kokkos_append_cmakefile,"else()") + @$(call kokkos_append_cmakefile,' set(NVCC_WRAPPER $$ENV{NVCC_WRAPPER} CACHE FILEPATH "Path to command nvcc_wrapper")') + @$(call kokkos_append_cmakefile,"endif()") + @$(call kokkos_append_string,"") + @$(call kokkos_append_string,"#Source and Header files of Kokkos relative to KOKKOS_PATH") + @$(call
kokkos_append_var,KOKKOS_HEADERS,'STRING "Kokkos headers list"') + @$(call kokkos_append_var,KOKKOS_HEADERS_IMPL,'STRING "Kokkos headers impl list"') + @$(call kokkos_append_var,KOKKOS_HEADERS_CUDA,'STRING "Kokkos headers Cuda list"') + @$(call kokkos_append_var,KOKKOS_HEADERS_OPENMP,'STRING "Kokkos headers OpenMP list"') + @$(call kokkos_append_var,KOKKOS_HEADERS_ROCM,'STRING "Kokkos headers ROCm list"') + @$(call kokkos_append_var,KOKKOS_HEADERS_THREADS,'STRING "Kokkos headers Threads list"') + @$(call kokkos_append_var,KOKKOS_HEADERS_QTHREADS,'STRING "Kokkos headers QThreads list"') + @$(call kokkos_append_var,KOKKOS_SRC,'STRING "Kokkos source list"') + @$(call kokkos_append_string,"") + @$(call kokkos_append_string,"#Variables used in application Makefiles") + @$(call kokkos_append_var,KOKKOS_OS,'STRING ""') # This was not in original cmake gen + @$(call kokkos_append_var,KOKKOS_CPP_DEPENDS,'STRING ""') + @$(call kokkos_append_var,KOKKOS_LINK_DEPENDS,'STRING ""') + @$(call kokkos_append_var,KOKKOS_CXXFLAGS,'STRING ""') + @$(call kokkos_append_var,KOKKOS_CPPFLAGS,'STRING ""') + @$(call kokkos_append_var,KOKKOS_LDFLAGS,'STRING ""') + @$(call kokkos_append_var,KOKKOS_LIBS,'STRING ""') + @$(call kokkos_append_var,KOKKOS_EXTRA_LIBS,'STRING ""') + @$(call kokkos_append_string,"") + @$(call kokkos_append_string,"#Internal settings which need to propagated for Kokkos examples") + @$(call kokkos_append_var,KOKKOS_INTERNAL_USE_CUDA,'STRING ""') + @$(call kokkos_append_var,KOKKOS_INTERNAL_USE_OPENMP,'STRING ""') + @$(call kokkos_append_var,KOKKOS_INTERNAL_USE_PTHREADS,'STRING ""') + @$(call kokkos_append_var,KOKKOS_INTERNAL_USE_ROCM,'STRING ""') + @$(call kokkos_append_var,KOKKOS_INTERNAL_USE_QTHREADS,'STRING ""') # Not in original cmake gen + @$(call kokkos_append_cmakefile,"mark_as_advanced(KOKKOS_HEADERS KOKKOS_SRC KOKKOS_INTERNAL_USE_CUDA KOKKOS_INTERNAL_USE_OPENMP KOKKOS_INTERNAL_USE_PTHREADS)") # comma after the macro name is required or make expands nothing + @$(call kokkos_append_makefile,"") + @$(call
kokkos_append_makefile,"#Fake kokkos-clean target") + @$(call kokkos_append_makefile,"kokkos-clean:") + @$(call kokkos_append_makefile,"") + @sed \ + -e 's|$(KOKKOS_PATH)/core/src|$(PREFIX)/include|g' \ + -e 's|$(KOKKOS_PATH)/containers/src|$(PREFIX)/include|g' \ + -e 's|$(KOKKOS_PATH)/algorithms/src|$(PREFIX)/include|g' \ + -e 's|-L$(PWD)|-L$(PREFIX)/lib|g' \ + -e 's|= libkokkos.a|= $(PREFIX)/lib/libkokkos.a|g' \ + -e 's|= $(KOKKOS_CONFIG_HEADER)|= $(PREFIX)/include/$(KOKKOS_CONFIG_HEADER)|g' $(KOKKOS_MAKEFILE) \ + > $(KOKKOS_MAKEFILE).tmp + @mv -f $(KOKKOS_MAKEFILE).tmp $(KOKKOS_MAKEFILE) + @$(call kokkos_setvar_cmakefile,KOKKOS_CXX_FLAGS,$(KOKKOS_CXXFLAGS)) + @$(call kokkos_setvar_cmakefile,KOKKOS_CPP_FLAGS,$(KOKKOS_CPPFLAGS)) + @$(call kokkos_setvar_cmakefile,KOKKOS_LD_FLAGS,$(KOKKOS_LDFLAGS)) + @$(call kokkos_setlist_cmakefile,KOKKOS_LIBS_LIST,$(KOKKOS_LIBS)) + @$(call kokkos_setlist_cmakefile,KOKKOS_EXTRA_LIBS_LIST,$(KOKKOS_EXTRA_LIBS)) + diff --git a/lib/kokkos/core/src/Makefile.generate_header_lists b/lib/kokkos/core/src/Makefile.generate_header_lists new file mode 100644 index 0000000000..cd308bf8f4 --- /dev/null +++ b/lib/kokkos/core/src/Makefile.generate_header_lists @@ -0,0 +1,28 @@ +# Build a List of Header Files + +KOKKOS_HEADERS_INCLUDE = $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS_INCLUDE_IMPL = $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) +KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) +KOKKOS_HEADERS_INCLUDE_IMPL += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) +KOKKOS_HEADERS_INCLUDE += $(wildcard $(KOKKOS_PATH)/algorithms/src/*.hpp) + +ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) + KOKKOS_HEADERS_CUDA += $(wildcard $(KOKKOS_PATH)/core/src/Cuda/*.hpp) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) + KOKKOS_HEADERS_THREADS += $(wildcard $(KOKKOS_PATH)/core/src/Threads/*.hpp) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_QTHREADS), 1) + KOKKOS_HEADERS_QTHREADS += $(wildcard 
$(KOKKOS_PATH)/core/src/Qthreads/*.hpp) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) + KOKKOS_HEADERS_OPENMP += $(wildcard $(KOKKOS_PATH)/core/src/OpenMP/*.hpp) +endif + +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + KOKKOS_HEADERS_ROCM += $(wildcard $(KOKKOS_PATH)/core/src/ROCm/*.hpp) +endif + diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp index 915fbe52c1..ed19a248a6 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Exec.cpp @@ -294,7 +294,7 @@ void OpenMP::initialize( int thread_count ) } { - if (nullptr == std::getenv("OMP_PROC_BIND") ) { + if ( Kokkos::show_warnings() && nullptr == std::getenv("OMP_PROC_BIND") ) { printf("Kokkos::OpenMP::initialize WARNING: OMP_PROC_BIND environment variable not set\n"); printf(" In general, for best performance with OpenMP 4.0 or better set OMP_PROC_BIND=spread and OMP_PLACES=threads\n"); printf(" For best performance with OpenMP 3.1 set OMP_PROC_BIND=true\n"); @@ -327,7 +327,7 @@ void OpenMP::initialize( int thread_count ) omp_set_num_threads(Impl::g_openmp_hardware_max_threads); } else { - if( thread_count > process_num_threads ) { + if( Kokkos::show_warnings() && thread_count > process_num_threads ) { printf( "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores.\n"); printf( " process threads available : %3d, requested thread : %3d\n", process_num_threads, thread_count ); } @@ -364,12 +364,12 @@ void OpenMP::initialize( int thread_count ) // Check for over-subscription - //if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) { - // std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; - // std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl; - // std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." 
<< std::endl; - // std::cout << " Requested: " << thread_count << " threads per process." << std::endl; - //} + if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) { + std::cout << "Kokkos::OpenMP::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; + std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl; + std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl; + std::cout << " Requested: " << thread_count << " threads per process." << std::endl; + } // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp index b54abb0068..70115b4728 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Parallel.hpp @@ -170,20 +170,20 @@ public: // MDRangePolicy impl template< class FunctorType , class ... Traits > class ParallelFor< FunctorType - , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::MDRangePolicy< Traits ... > , Kokkos::OpenMP > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef Kokkos::MDRangePolicy< Traits ... 
> MDRangePolicy ; typedef typename MDRangePolicy::impl_range_policy Policy ; typedef typename MDRangePolicy::work_tag WorkTag ; typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::member_type Member ; - typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; OpenMPExec * m_instance ; const FunctorType m_functor ; @@ -292,11 +292,12 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; // Static Assert WorkTag void if ReducerType not InvalidType - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTagFwd > ValueJoin ; typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; @@ -393,7 +394,7 @@ public: , m_instance->get_thread_data(i)->pool_reduce_local() ); } - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); if ( m_result_ptr ) { const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) ); @@ -445,14 +446,14 @@ public: // MDRangePolicy impl template< class FunctorType , class ReducerType, class ... 
Traits > class ParallelReduce< FunctorType - , Kokkos::Experimental::MDRangePolicy< Traits ...> + , Kokkos::MDRangePolicy< Traits ...> , ReducerType , Kokkos::OpenMP > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ; typedef typename MDRangePolicy::impl_range_policy Policy ; typedef typename MDRangePolicy::work_tag WorkTag ; @@ -463,16 +464,17 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; typedef typename ReducerTypeFwd::value_type ValueType; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTagFwd > ValueJoin ; typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy , FunctorType , WorkTag , ValueType @@ -558,7 +560,7 @@ public: , m_instance->get_thread_data(i)->pool_reduce_local() ); } - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); if ( m_result_ptr ) { const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) ); @@ -920,9 +922,10 @@ private: , FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef 
typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTagFwd > ValueJoin ; typedef typename Analysis::pointer_type pointer_type ; typedef typename Analysis::reference_type reference_type ; @@ -1067,7 +1070,7 @@ public: , m_instance->get_thread_data(i)->pool_reduce_local() ); } - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , ptr ); if ( m_result_ptr ) { const int n = Analysis::value_count( ReducerConditional::select(m_functor , m_reducer) ); diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp index 289ad15451..540b91a52a 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_WorkGraphPolicy.hpp @@ -49,33 +49,26 @@ namespace Impl { template< class FunctorType , class ... Traits > class ParallelFor< FunctorType , - Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::WorkGraphPolicy< Traits ... > , Kokkos::OpenMP > - : public Kokkos::Impl::Experimental:: - WorkGraphExec< FunctorType, - Kokkos::OpenMP, - Traits ... - > { private: - typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; - typedef Kokkos::Impl::Experimental:: - WorkGraphExec Base ; + typedef Kokkos::WorkGraphPolicy< Traits ... 
> Policy ; + + Policy m_policy ; + FunctorType m_functor ; template< class TagType > typename std::enable_if< std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - Base::m_functor( i ); - } + exec_one( const std::int32_t w ) const noexcept + { m_functor( w ); } template< class TagType > typename std::enable_if< ! std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - const TagType t{} ; - Base::m_functor( t , i ); - } + exec_one( const std::int32_t w ) const noexcept + { const TagType t{} ; m_functor( t , w ); } public: @@ -86,9 +79,15 @@ public: #pragma omp parallel num_threads(pool_size) { - for (std::int32_t i; (-1 != (i = Base::before_work())); ) { - exec_one< typename Policy::work_tag >( i ); - Base::after_work(i); + // Spin until COMPLETED_TOKEN. + // END_TOKEN indicates no work is currently available. + + for ( std::int32_t w = Policy::END_TOKEN ; + Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) { + if ( Policy::END_TOKEN != w ) { + exec_one< typename Policy::work_tag >( w ); + m_policy.completed_work(w); + } } } } @@ -96,12 +95,13 @@ public: inline ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy ) - : Base( arg_functor, arg_policy ) - { - } + : m_policy( arg_policy ) + , m_functor( arg_functor ) + {} }; } // namespace Impl } // namespace Kokkos #endif /* #define KOKKOS_OPENMP_WORKGRAPHPOLICY_HPP */ + diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index b624384e7c..035ee2e7a6 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -248,12 +248,13 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename 
Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; // Static Assert WorkTag void if ReducerType not InvalidType - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd, WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTagFwd > ValueJoin ; enum {HasJoin = ReduceFunctorHasJoin::value }; enum {UseReducer = is_reducer_type::value }; @@ -620,10 +621,11 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTag > ValueJoin ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , WorkTagFwd > ValueJoin ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; diff --git a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp index b45c7114a3..6c94319004 100644 --- a/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp +++ b/lib/kokkos/core/src/Qthreads/Kokkos_Qthreads_Parallel.hpp @@ -150,11 +150,12 @@ private: typedef Kokkos::Impl::if_c< 
std::is_same::value, FunctorType, ReducerType > ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void >::type WorkTagFwd; // Static Assert WorkTag void if ReducerType not InvalidType - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; @@ -213,7 +214,7 @@ public: const pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result(); - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer) , data ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer) , data ); if ( m_result_ptr ) { const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); @@ -331,9 +332,10 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void >::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; @@ -394,7 +396,7 @@ public: const 
pointer_type data = (pointer_type) QthreadsExec::exec_all_reduce_result(); - Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTag >::final( ReducerConditional::select(m_functor , m_reducer), data ); + Kokkos::Impl::FunctorFinal< ReducerTypeFwd , WorkTagFwd >::final( ReducerConditional::select(m_functor , m_reducer), data ); if ( m_result_ptr ) { const unsigned n = ValueTraits::value_count( ReducerConditional::select(m_functor , m_reducer) ); diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp index a93f488203..3c7586e264 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Atomic.hpp @@ -125,7 +125,7 @@ namespace Kokkos { oldval.t = *dest ; assume.i = oldval.i ; newval.t = val ; - atomic_compare_exchange( reinterpret_cast(dest) , assume.i, newval.i ); + atomic_compare_exchange( (int*)(dest) , assume.i, newval.i ); return oldval.t ; } diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp index 1322391d92..ffb129cb86 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Impl.cpp @@ -608,6 +608,7 @@ ROCmInternal::scratch_space( const Kokkos::Experimental::ROCm::size_type size ) void ROCmInternal::finalize() { + Kokkos::Impl::rocm_device_synchronize(); was_finalized = 1; if ( 0 != m_scratchSpace || 0 != m_scratchFlags ) { diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp index 14ab52a1c2..04f4754db2 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Parallel.hpp @@ -277,7 +277,7 @@ public: this->team_barrier(); value = local_value; } -// Reduce accross a team of threads. +// Reduce across a team of threads. // // Each thread has vector_length elements. 
// This reduction is for TeamThreadRange operations, where the range @@ -354,6 +354,80 @@ public: return buffer[0]; } +// Reduce across a team of threads, with a reducer data type +// +// Each thread has vector_length elements. +// This reduction is for TeamThreadRange operations, where the range +// is spread across threads. Effectively, there are vector_length +// independent reduction operations. +// This is different from a reduction across the elements of a thread, +// which reduces every vector element. + + template< class ReducerType > + KOKKOS_INLINE_FUNCTION + typename std::enable_if< is_reducer< ReducerType >::value >::type + team_reduce( const ReducerType & reducer) const + { + typedef typename ReducerType::value_type value_type ; + + tile_static value_type buffer[512]; + const auto local = lindex(); + const auto team = team_rank(); + auto vector_rank = local%m_vector_length; + auto thread_base = team*m_vector_length; + + const std::size_t size = next_pow_2(m_team_size+1)/2; +#if defined(ROCM15) + buffer[local] = reducer.reference(); +#else + // ROCM 1.5 handles address spaces better, previous version didn't + lds_for(buffer[local], [&](ValueType& x) + { + x = value; + }); +#endif + m_idx.barrier.wait(); + + for(std::size_t s = 1; s < size; s *= 2) + { + const std::size_t index = 2 * s * team; + if (index < size) + { +#if defined(ROCM15) + reducer.join(buffer[vector_rank+index*m_vector_length], + buffer[vector_rank+(index+s)*m_vector_length]); +#else + lds_for(buffer[vector_rank+index*m_vector_length], [&](ValueType& x) + { + lds_for(buffer[vector_rank+(index+s)*m_vector_length], + [&](ValueType& y) + { + reducer.join(x, y); + }); + }); +#endif + } + m_idx.barrier.wait(); + } + + if (local == 0) + { + for(int i=size*m_vector_length; i + KOKKOS_INLINE_FUNCTION static + typename std::enable_if< is_reducer< ReducerType >::value >::type + vector_reduce( ReducerType const & reducer ) + { + #ifdef __HCC_ACCELERATOR__ + if(blockDim_x == 1) return; + + // Intra 
vector lane shuffle reduction: + typename ReducerType::value_type tmp ( reducer.reference() ); + + for ( int i = blockDim_x ; ( i >>= 1 ) ; ) { + shfl_down( reducer.reference() , i , blockDim_x ); + if ( (int)threadIdx_x < i ) { reducer.join( tmp , reducer.reference() ); } + } + + // Broadcast from root lane to all other lanes. + // Cannot use "butterfly" algorithm to avoid the broadcast + // because floating point summation is not associative + // and thus different threads could have different results. + + shfl( reducer.reference() , 0 , blockDim_x ); + #endif + } + + + /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. * @@ -1075,6 +1176,22 @@ void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct()); } +/** \brief Inter-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all threads of the the calling thread team and a summation of + * val is performed and put into result. This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ReducerType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct& loop_boundaries, + const Lambda & lambda, ReducerType const & reducer) { + reducer.init( reducer.reference() ); + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,reducer.reference()); + } + loop_boundaries.thread.team_reduce(reducer); +} + /** \brief Intra-thread thread range parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. 
* * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of @@ -1161,6 +1278,41 @@ void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, const Lambda & lambda, ReducerType const & reducer) { + reducer.init( reducer.reference() ); + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,reducer.reference()); + } + loop_boundaries.thread.vector_reduce(reducer); +} +/** \brief Intra-thread vector parallel_reduce. Executes lambda(iType i, ValueType & val) for each i=0..N-1. + * + * The range i=0..N-1 is mapped to all vector lanes of the the calling thread and a reduction of + * val is performed using JoinType(ValueType& val, const ValueType& update) and put into init_result. + * The input value of init_result is used as initializer for temporary variables of ValueType. Therefore + * the input value should be the neutral element with respect to the join operation (e.g. '0 for +-' or + * '1 for *'). This functionality requires C++11 support.*/ +template< typename iType, class Lambda, typename ReducerType, class JoinType > +KOKKOS_INLINE_FUNCTION +void parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, const Lambda & lambda, const JoinType& join, ReducerType const & reducer) { + + for( iType i = loop_boundaries.start; i < loop_boundaries.end; i+=loop_boundaries.increment) { + lambda(i,reducer.reference()); + loop_boundaries.thread.team_barrier(); + } + reducer.reference() = loop_boundaries.thread.thread_reduce(reducer.reference(),join); +} + /** \brief Intra-thread vector parallel exclusive prefix sum. Executes lambda(iType i, ValueType & val, bool final) * for each i=0..N-1. 
* diff --git a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp index f2674e5929..4e96aa6eaf 100644 --- a/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp +++ b/lib/kokkos/core/src/ROCm/Kokkos_ROCm_Reduce.hpp @@ -102,11 +102,12 @@ void reduce_enqueue( typedef Kokkos::Impl::if_c< std::is_same::value, F, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, Tag, void >::type TagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , Tag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , Tag > ValueInit ; - typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , Tag > ValueJoin ; - typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , Tag > ValueFinal ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , TagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , TagFwd > ValueInit ; + typedef Kokkos::Impl::FunctorValueJoin< ReducerTypeFwd , TagFwd > ValueJoin ; + typedef Kokkos::Impl::FunctorFinal< ReducerTypeFwd , TagFwd > ValueFinal ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp index 35b2163ae5..977ada214e 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.cpp @@ -266,7 +266,7 @@ void ThreadsExec::execute_sleep( ThreadsExec & exec , const void * ) const int rank_rev = exec.m_pool_size - ( exec.m_pool_rank + 1 ); for ( int i = 0 ; i < n ; ++i ) { - Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( exec.m_pool_base[ rank_rev + (1<m_pool_state , ThreadsExec::Active ); } exec.m_pool_state = ThreadsExec::Inactive ; @@ -310,7 +310,7 
@@ void ThreadsExec::fence() { if ( s_thread_pool_size[0] ) { // Wait for the root thread to complete: - Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( s_threads_exec[0]->m_pool_state , ThreadsExec::Active ); } s_current_function = 0 ; @@ -716,12 +716,12 @@ void ThreadsExec::initialize( unsigned thread_count , } // Check for over-subscription - //if( Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node() ) { - // std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; - // std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl; - // std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl; - // std::cout << " Requested: " << thread_count << " threads per process." << std::endl; - //} + if( Kokkos::show_warnings() && (Impl::mpi_ranks_per_node() * long(thread_count) > Impl::processors_per_node()) ) { + std::cout << "Kokkos::Threads::initialize WARNING: You are likely oversubscribing your CPU cores." << std::endl; + std::cout << " Detected: " << Impl::processors_per_node() << " cores per node." << std::endl; + std::cout << " Detected: " << Impl::mpi_ranks_per_node() << " MPI_ranks per node." << std::endl; + std::cout << " Requested: " << thread_count << " threads per process." 
<< std::endl; + } // Init the array for used for arbitrarily sized atomics Impl::init_lock_array_host_space(); diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp index 7557bad7d9..71189cf7cc 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsExec.hpp @@ -107,7 +107,7 @@ private: // Which thread am I stealing from currently int m_current_steal_target; // This thread's owned work_range - Kokkos::pair m_work_range KOKKOS_ALIGN(16); + Kokkos::pair m_work_range __attribute__((aligned(16))) ; // Team Offset if one thread determines work_range for others long m_team_work_index; @@ -191,13 +191,13 @@ public: // Fan-in reduction with highest ranking thread as the root for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Active ); } if ( rev_rank ) { m_pool_state = ThreadsExec::Rendezvous ; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous ); + Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous ); } else { // Root thread does the reduction and broadcast @@ -233,13 +233,13 @@ public: // Fan-in reduction with highest ranking thread as the root for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Active ); } if ( rev_rank ) { m_pool_state = ThreadsExec::Rendezvous ; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous ); + Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous ); } else { // Root thread does the reduction and 
broadcast @@ -268,7 +268,7 @@ public: ThreadsExec & fan = *m_pool_base[ rev_rank + ( 1 << i ) ] ; - Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active ); Join::join( f , reduce_memory() , fan.reduce_memory() ); } @@ -295,7 +295,7 @@ public: const int rev_rank = m_pool_size - ( m_pool_rank + 1 ); for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { - Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( m_pool_base[rev_rank+(1<m_pool_state , ThreadsExec::Active ); } } @@ -327,7 +327,7 @@ public: ThreadsExec & fan = *m_pool_base[ rev_rank + (1< ReductionAvailable (or ScanAvailable) - Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::Active ); Join::join( f , work_value , fan.reduce_memory() ); } @@ -345,8 +345,8 @@ public: // Wait: Active -> ReductionAvailable // Wait: ReductionAvailable -> ScanAvailable - Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active ); - Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable ); + Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( th.m_pool_state , ThreadsExec::ReductionAvailable ); Join::join( f , work_value + count , ((scalar_type *)th.reduce_memory()) + count ); } @@ -357,7 +357,7 @@ public: // Wait for all threads to complete inclusive scan // Wait: ScanAvailable -> Rendezvous - Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable ); + Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanAvailable ); } //-------------------------------- @@ -365,7 +365,7 @@ public: for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { ThreadsExec & fan = *m_pool_base[ rev_rank + (1< ScanAvailable - Impl::spinwait_while_equal( fan.m_pool_state , ThreadsExec::ReductionAvailable ); + Impl::spinwait_while_equal( 
fan.m_pool_state , ThreadsExec::ReductionAvailable ); // Set: ScanAvailable -> Rendezvous fan.m_pool_state = ThreadsExec::Rendezvous ; } @@ -392,13 +392,13 @@ public: // Wait for all threads to copy previous thread's inclusive scan value // Wait for all threads: Rendezvous -> ScanCompleted for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { - Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Rendezvous ); + Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Rendezvous ); } if ( rev_rank ) { // Set: ScanAvailable -> ScanCompleted m_pool_state = ThreadsExec::ScanCompleted ; // Wait: ScanCompleted -> Active - Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted ); + Impl::spinwait_while_equal( m_pool_state , ThreadsExec::ScanCompleted ); } // Set: ScanCompleted -> Active for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { @@ -425,7 +425,7 @@ public: // Fan-in reduction with highest ranking thread as the root for ( int i = 0 ; i < m_pool_fan_size ; ++i ) { // Wait: Active -> Rendezvous - Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Active ); + Impl::spinwait_while_equal( m_pool_base[ rev_rank + (1<m_pool_state , ThreadsExec::Active ); } for ( unsigned i = 0 ; i < count ; ++i ) { work_value[i+count] = work_value[i]; } @@ -433,7 +433,7 @@ public: if ( rev_rank ) { m_pool_state = ThreadsExec::Rendezvous ; // Wait: Rendezvous -> Active - Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous ); + Impl::spinwait_while_equal( m_pool_state , ThreadsExec::Rendezvous ); } else { // Root thread does the thread-scan before releasing threads diff --git a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp index 6060bf191f..7a51b41bfb 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_ThreadsTeam.hpp @@ -107,13 +107,13 @@ public: // Wait for fan-in threads 
for ( n = 1 ; ( ! ( m_team_rank_rev & n ) ) && ( ( j = m_team_rank_rev + n ) < m_team_size ) ; n <<= 1 ) { - Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active ); + Impl::spinwait_while_equal( m_team_base[j]->state() , ThreadsExec::Active ); } // If not root then wait for release if ( m_team_rank_rev ) { m_exec->state() = ThreadsExec::Rendezvous ; - Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous ); + Impl::spinwait_while_equal( m_exec->state() , ThreadsExec::Rendezvous ); } return ! m_team_rank_rev ; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp index 18ac7d26ad..be9f5a6f87 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Parallel.hpp @@ -180,12 +180,12 @@ public: // MDRangePolicy impl template< class FunctorType , class ... Traits > class ParallelFor< FunctorType - , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::MDRangePolicy< Traits ... > , Kokkos::Threads > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef Kokkos::MDRangePolicy< Traits ... 
> MDRangePolicy ; typedef typename MDRangePolicy::impl_range_policy Policy ; typedef typename MDRangePolicy::work_tag WorkTag ; @@ -193,7 +193,7 @@ private: typedef typename Policy::WorkRange WorkRange ; typedef typename Policy::member_type Member ; - typedef typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; + typedef typename Kokkos::Impl::HostIterateTile< MDRangePolicy, FunctorType, typename MDRangePolicy::work_tag, void > iterate_type; const FunctorType m_functor ; const MDRangePolicy m_mdr_policy ; @@ -396,9 +396,10 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; @@ -458,7 +459,7 @@ private: ( self.m_functor , range.begin() , range.end() , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); - exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } template @@ -484,7 +485,7 @@ private: work_index = exec.get_work_index(); } - exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + exec.template fan_in_reduce< 
ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } public: @@ -548,14 +549,14 @@ public: // MDRangePolicy impl template< class FunctorType , class ReducerType, class ... Traits > class ParallelReduce< FunctorType - , Kokkos::Experimental::MDRangePolicy< Traits ... > + , Kokkos::MDRangePolicy< Traits ... > , ReducerType , Kokkos::Threads > { private: - typedef Kokkos::Experimental::MDRangePolicy< Traits ... > MDRangePolicy ; + typedef Kokkos::MDRangePolicy< Traits ... > MDRangePolicy ; typedef typename MDRangePolicy::impl_range_policy Policy ; typedef typename MDRangePolicy::work_tag WorkTag ; @@ -564,16 +565,17 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; typedef typename ReducerTypeFwd::value_type ValueType; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRangePolicy + using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRangePolicy , FunctorType , WorkTag , ValueType @@ -618,7 +620,7 @@ private: ( self.m_mdr_policy, self.m_functor , range.begin() , range.end() , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); - exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + exec.template fan_in_reduce< 
ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } template @@ -644,7 +646,7 @@ private: work_index = exec.get_work_index(); } - exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } public: @@ -725,9 +727,10 @@ private: typedef Kokkos::Impl::if_c< std::is_same::value, FunctorType, ReducerType> ReducerConditional; typedef typename ReducerConditional::type ReducerTypeFwd; + typedef typename Kokkos::Impl::if_c< std::is_same::value, WorkTag, void>::type WorkTagFwd; - typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd, WorkTag > ValueTraits ; - typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd, WorkTag > ValueInit ; + typedef Kokkos::Impl::FunctorValueTraits< ReducerTypeFwd , WorkTagFwd > ValueTraits ; + typedef Kokkos::Impl::FunctorValueInit< ReducerTypeFwd , WorkTagFwd > ValueInit ; typedef typename ValueTraits::pointer_type pointer_type ; typedef typename ValueTraits::reference_type reference_type ; @@ -767,7 +770,7 @@ private: ( self.m_functor , Member( & exec , self.m_policy , self.m_shared ) , ValueInit::init( ReducerConditional::select(self.m_functor , self.m_reducer) , exec.reduce_memory() ) ); - exec.template fan_in_reduce< ReducerTypeFwd , WorkTag >( ReducerConditional::select(self.m_functor , self.m_reducer) ); + exec.template fan_in_reduce< ReducerTypeFwd , WorkTagFwd >( ReducerConditional::select(self.m_functor , self.m_reducer) ); } public: diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index be904a1670..203e5b2b99 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -49,48 +49,50 @@ namespace Impl { template< class 
FunctorType , class ... Traits > class ParallelFor< FunctorType , - Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::WorkGraphPolicy< Traits ... > , Kokkos::Threads > - : public Kokkos::Impl::Experimental:: - WorkGraphExec< FunctorType, - Kokkos::Threads, - Traits ... - > { private: - typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; - typedef Kokkos::Impl::Experimental:: - WorkGraphExec Base ; + typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ; + typedef ParallelFor, + Kokkos::WorkGraphPolicy, Kokkos::Threads> Self ; + Policy m_policy ; + FunctorType m_functor ; + template< class TagType > typename std::enable_if< std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - Base::m_functor( i ); - } + exec_one( const std::int32_t w ) const noexcept + { m_functor( w ); } template< class TagType > typename std::enable_if< ! std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - const TagType t{} ; - Base::m_functor( t , i ); - } + exec_one( const std::int32_t w ) const noexcept + { const TagType t{}; m_functor( t , w ); } - inline void exec_one_thread() const { - for (std::int32_t i; (-1 != (i = Base::before_work())); ) { - exec_one< typename Policy::work_tag >( i ); - Base::after_work(i); + inline void exec_one_thread() const noexcept + { + // Spin until COMPLETED_TOKEN. + // END_TOKEN indicates no work is currently available. 
+ + for ( std::int32_t w = Policy::END_TOKEN ; + Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) { + if ( Policy::END_TOKEN != w ) { + exec_one< typename Policy::work_tag >( w ); + m_policy.completed_work(w); + } + } } - } - static inline void thread_main( ThreadsExec&, const void* arg ) { - const Self& self = *(static_cast(arg)); - self.exec_one_thread(); - } + static inline void thread_main( ThreadsExec&, const void* arg ) noexcept + { + const Self& self = *(static_cast(arg)); + self.exec_one_thread(); + } public: @@ -104,9 +106,9 @@ public: inline ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy ) - : Base( arg_functor, arg_policy ) - { - } + : m_policy( arg_policy ) + , m_functor( arg_functor ) + {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index 0171b209e5..e851a1e217 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -59,7 +59,7 @@ #include #include -namespace Kokkos { namespace Experimental { namespace Impl { +namespace Kokkos { namespace Impl { // Temporary, for testing new loop macros #define KOKKOS_ENABLE_NEW_LOOP_MACROS 1 @@ -1274,7 +1274,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, Tagged, typename std::enable_if< !std::i template -using is_void = std::is_same< T , void >; +using is_void_type = std::is_same< T , void >; template struct is_type_array : std::false_type @@ -1303,7 +1303,7 @@ template < typename RP , typename Tag , typename ValueType > -struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void::value >::type > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< is_void_type::value >::type > { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -1781,7 +1781,7 @@ template < typename RP , typename Tag , typename 
ValueType > -struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value && !is_type_array::value >::type > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type::value && !is_type_array::value >::type > { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2268,7 +2268,7 @@ template < typename RP , typename Tag , typename ValueType > -struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void::value && is_type_array::value >::type > +struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_if< !is_void_type::value && is_type_array::value >::type > { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2750,6 +2750,8 @@ struct HostIterateTile < RP , Functor , Tag , ValueType , typename std::enable_i // Cuda uses DeviceIterateTile directly within md_parallel_for // TODO Once md_parallel_{for,reduce} removed, this can be removed +namespace Experimental { + // ParallelReduce - scalar reductions template < typename MDRange, typename Functor, typename ValueType = void > struct MDFunctor @@ -2759,11 +2761,11 @@ struct MDFunctor using value_type = ValueType; using work_tag = typename range_policy::work_tag; using index_type = typename range_policy::index_type; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange - , Functor - , work_tag - , value_type - >; + using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange + , Functor + , work_tag + , value_type + >; inline @@ -2804,11 +2806,11 @@ struct MDFunctor< MDRange, Functor, ValueType[] > using value_type = ValueType[]; using work_tag = typename range_policy::work_tag; using index_type = typename range_policy::index_type; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange - , Functor - , work_tag - , value_type - >; + using 
iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange + , Functor + , work_tag + , value_type + >; inline @@ -2852,11 +2854,11 @@ struct MDFunctor< MDRange, Functor, void > using functor_type = Functor; using work_tag = typename range_policy::work_tag; using index_type = typename range_policy::index_type; - using iterate_type = typename Kokkos::Experimental::Impl::HostIterateTile< MDRange - , Functor - , work_tag - , void - >; + using iterate_type = typename Kokkos::Impl::HostIterateTile< MDRange + , Functor + , work_tag + , void + >; inline @@ -2887,8 +2889,9 @@ struct MDFunctor< MDRange, Functor, void > Functor m_func; }; +} // end namespace Experimental #undef KOKKOS_ENABLE_NEW_LOOP_MACROS -} } } //end namespace Kokkos::Experimental::Impl +} } //end namespace Kokkos::Impl #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp index 49fca9c855..bc0d969699 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Compare_Exchange_Strong.hpp @@ -49,6 +49,10 @@ #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP ) #define KOKKOS_ATOMIC_COMPARE_EXCHANGE_STRONG_HPP +#if defined(KOKKOS_ENABLE_CUDA) +#include +#endif + namespace Kokkos { //---------------------------------------------------------------------------- @@ -103,7 +107,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare , T return_val; // This is a way to (hopefully) avoid dead lock in a warp int done = 0; - unsigned int active = __ballot(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); unsigned int done_active = 0; while (active!=done_active) { if(!done) { @@ -115,7 +119,7 @@ T atomic_compare_exchange( volatile T * const dest , const T & compare , done = 1; } } - done_active = __ballot(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); } return return_val; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp index 9ba3cae9fc..2f5bfd44e8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Exchange.hpp @@ -49,6 +49,10 @@ #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_ATOMIC_EXCHANGE_HPP ) #define KOKKOS_ATOMIC_EXCHANGE_HPP +#if defined(KOKKOS_ENABLE_CUDA) +#include +#endif + namespace Kokkos { //---------------------------------------------------------------------------- @@ -126,7 +130,7 @@ T atomic_exchange( volatile T * const dest , #endif int done = 0; - unsigned int active = __ballot(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); unsigned int done_active = 0; while (active!=done_active) { if(!done) { @@ -137,7 +141,7 @@ T atomic_exchange( volatile T * const dest , done = 1; } } - done_active = __ballot(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); } return return_val; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp index 2af1737c31..dfdd133a3c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Add.hpp @@ -49,6 +49,10 @@ #if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_ADD_HPP ) #define KOKKOS_ATOMIC_FETCH_ADD_HPP +#if defined(KOKKOS_ENABLE_CUDA) +#include +#endif + namespace Kokkos { //---------------------------------------------------------------------------- @@ -139,7 +143,7 @@ T atomic_fetch_add( volatile T * const dest , T return_val; // This is a way to (hopefully) avoid dead lock in a warp int done = 0; - unsigned int active = __ballot(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); unsigned int done_active = 0; while (active!=done_active) { if(!done) { @@ -151,7 +155,7 @@ T atomic_fetch_add( volatile T * const dest , done = 1; } } - done_active = __ballot(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); } return return_val; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp index b7c14052eb..fc8955d909 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Fetch_Sub.hpp @@ -49,6 +49,10 @@ 
#if defined( KOKKOS_ATOMIC_HPP ) && ! defined( KOKKOS_ATOMIC_FETCH_SUB_HPP ) #define KOKKOS_ATOMIC_FETCH_SUB_HPP +#if defined(KOKKOS_ENABLE_CUDA) +#include +#endif + namespace Kokkos { //---------------------------------------------------------------------------- @@ -117,7 +121,7 @@ T atomic_fetch_sub( volatile T * const dest , T return_val; // This is a way to (hopefully) avoid dead lock in a warp int done = 0; - unsigned int active = __ballot(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); unsigned int done_active = 0; while (active!=done_active) { if(!done) { @@ -128,7 +132,7 @@ T atomic_fetch_sub( volatile T * const dest , done = 1; } } - done_active = __ballot(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); } return return_val; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp index f47ba1a98a..3a2a9e1f80 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Atomic_Generic.hpp @@ -46,6 +46,10 @@ #define KOKKOS_ATOMIC_GENERIC_HPP #include +#if defined(KOKKOS_ENABLE_CUDA) +#include +#endif + // Combination operands to be used in an Compare and Exchange based atomic operation namespace Kokkos { namespace Impl { @@ -242,7 +246,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest , // This is a way to (hopefully) avoid dead lock in a warp T return_val; int done = 0; - unsigned int active = __ballot(1); + unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); unsigned int done_active = 0; while (active!=done_active) { if(!done) { @@ -253,7 +257,7 @@ T atomic_fetch_oper( const Oper& op, volatile T * const dest , done=1; } } - done_active = __ballot(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); } return return_val; #endif @@ -281,7 +285,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest , T return_val; // This is a way to (hopefully) avoid dead lock in a warp int done = 0; - unsigned int active = __ballot(1); 
+ unsigned int active = KOKKOS_IMPL_CUDA_BALLOT(1); unsigned int done_active = 0; while (active!=done_active) { if(!done) { @@ -292,7 +296,7 @@ T atomic_oper_fetch( const Oper& op, volatile T * const dest , done=1; } } - done_active = __ballot(done); + done_active = KOKKOS_IMPL_CUDA_BALLOT(done); } return return_val; #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp index c5e73c8b26..18c61a209c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_BitOps.hpp @@ -48,6 +48,10 @@ #include #include +#if defined( __HCC_ACCELERATOR__ ) +#include +#endif + namespace Kokkos { namespace Impl { diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index ab6cffc7c3..2d03cd2f72 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -51,9 +51,12 @@ //---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { namespace { +bool g_is_initialized = false; +bool g_show_warnings = true; +} + +namespace Kokkos { namespace Impl { namespace { bool is_unsigned_int(const char* str) { @@ -75,6 +78,10 @@ void initialize_internal(const InitArguments& args) setenv("MEMKIND_HBW_NODES", "1", 0); #endif + if (args.disable_warnings) { + g_show_warnings = false; + } + // Protect declarations, to prevent "unused variable" warnings. #if defined( KOKKOS_ENABLE_OPENMP ) || defined( KOKKOS_ENABLE_THREADS ) || defined( KOKKOS_ENABLE_OPENMPTARGET ) const int num_threads = args.num_threads; @@ -125,10 +132,8 @@ setenv("MEMKIND_HBW_NODES", "1", 0); // struct, you may remove this line of code. 
(void) args; - if( std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value || - std::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value ) { - Kokkos::Serial::initialize(); - } + // Always initialize Serial if it is configure time enabled + Kokkos::Serial::initialize(); #endif #if defined( KOKKOS_ENABLE_OPENMPTARGET ) @@ -177,6 +182,7 @@ setenv("MEMKIND_HBW_NODES", "1", 0); #if defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::initialize(); #endif + g_is_initialized = true; } void finalize_internal( const bool all_spaces = false ) @@ -226,13 +232,12 @@ void finalize_internal( const bool all_spaces = false ) #endif #if defined( KOKKOS_ENABLE_SERIAL ) - if( std::is_same< Kokkos::Serial , Kokkos::DefaultExecutionSpace >::value || - std::is_same< Kokkos::Serial , Kokkos::HostSpace::execution_space >::value || - all_spaces ) { - if(Kokkos::Serial::is_initialized()) - Kokkos::Serial::finalize(); - } + if(Kokkos::Serial::is_initialized()) + Kokkos::Serial::finalize(); #endif + + g_is_initialized = false; + g_show_warnings = true; } void fence_internal() @@ -306,9 +311,7 @@ bool check_int_arg(char const* arg, char const* expected, int* value) { return true; } -} // namespace -} // namespace Impl -} // namespace Kokkos +}}} // namespace Kokkos::Impl::{unnamed} //---------------------------------------------------------------------------- @@ -319,6 +322,7 @@ void initialize(int& narg, char* arg[]) int num_threads = -1; int numa = -1; int device = -1; + bool disable_warnings = false; int kokkos_threads_found = 0; int kokkos_numa_found = 0; @@ -373,6 +377,7 @@ void initialize(int& narg, char* arg[]) } if((strncmp(arg[iarg],"--kokkos-ndevices",17) == 0) || !kokkos_ndevices_found) ndevices = atoi(num1_only); + delete [] num1_only; if( num2 != NULL ) { if(( !Impl::is_unsigned_int(num2+1) ) || (strlen(num2)==1) ) @@ -415,6 +420,12 @@ void initialize(int& narg, char* arg[]) } else { iarg++; } + } else if ( 
strcmp(arg[iarg],"--kokkos-disable-warnings") == 0) { + disable_warnings = true; + for(int k=iarg;k struct FunctorValueInit< FunctorType , ArgTag , T & , Enable > { KOKKOS_FORCEINLINE_FUNCTION static - T & init( const FunctorType & f , void * p ) + T & init( const FunctorType & , void * p ) { return *( new(p) T() ); }; }; diff --git a/lib/kokkos/core/src/impl/Kokkos_HostBarrier.cpp b/lib/kokkos/core/src/impl/Kokkos_HostBarrier.cpp new file mode 100644 index 0000000000..e382acae32 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_HostBarrier.cpp @@ -0,0 +1,204 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +#include +#include + +namespace Kokkos { namespace Impl { + +namespace { + +enum : int { HEADER_SIZE = HostBarrier::HEADER / sizeof(uint64_t) }; + +inline constexpr int length64( const int nthreads ) noexcept +{ + return (nthreads-1 + sizeof(uint64_t)-1) / sizeof(uint64_t); +} + +} // namespace + +void rendezvous_initialize( volatile void * buffer + , const int size + , const int rank + ) noexcept +{ + Kokkos::store_fence(); + + // ensure that the buffer has been zero'd out + constexpr uint8_t zero8 = static_cast(0); + constexpr uint64_t zero64 = static_cast(0); + + volatile uint64_t * header = reinterpret_cast(buffer); + + if (rank > 0) { + volatile uint8_t * bytes = reinterpret_cast(buffer) + RENDEZVOUS_HEADER; + + bytes[rank-1] = zero8; + + // last thread is responsible for zeroing out the final bytes of the last uint64_t + if (rank == size-1) { + const int tmp = (size-1) % sizeof(uint64_t); + const int rem = tmp ? sizeof(uint64_t) - tmp : 0; + for (int i=0; i(buffer) + HEADER_SIZE; + + // wait for other threads to finish initializing + for (int i=0; i(step + 1u) + ? 
step + 1u + : step + 2u + ; + + // if size == 1, it is incorrect for rank 0 to check the tail value of the buffer + // this optimization prevents a potential read of uninitialized memory + if ( size == 1 ) { return true; } + + const uint8_t byte_value = static_cast(step); + + // byte that is set in the spin_value rotates every time + // this prevents threads from overtaking the master thread + const uint64_t spin_value = static_cast(byte_value) << (byte_value&7); + + if ( rank > 0 ) { + volatile uint64_t * header = reinterpret_cast(buffer); + volatile uint8_t * bytes = reinterpret_cast(buffer) + RENDEZVOUS_HEADER; + + bytes[ rank-1 ] = byte_value; + + if ( active_wait ) { + spinwait_until_equal( *header, spin_value ); + } + else { + yield_until_equal( *header, spin_value ); + } + } + else { // rank 0 + volatile uint64_t * buff = reinterpret_cast(buffer) + HEADER_SIZE; + const int n = length64(size); + + uint64_t comp = byte_value; + comp = comp | (comp << 8); + comp = comp | (comp << 16); + comp = comp | (comp << 32); + + const int rem = (size-1) % sizeof(uint64_t); + + union { + volatile uint64_t value; + volatile uint8_t array[sizeof(uint64_t)]; + } tmp{}; + + for (int i=0; i(step); + const uint64_t spin_value = static_cast(byte_value) << (byte_value&7); + volatile uint64_t * header = reinterpret_cast(buffer); + + // Force all outstanding stores from this thread to retire before releasing + // the other threads. This forces correctness on systems with out-of-order + // memory (Power and ARM) + Kokkos::store_fence(); + + *header = spin_value; + + Kokkos::memory_fence(); +} + +}} // namespace Kokkos::Impl + diff --git a/lib/kokkos/core/src/impl/Kokkos_HostBarrier.hpp b/lib/kokkos/core/src/impl/Kokkos_HostBarrier.hpp new file mode 100644 index 0000000000..733b69e79f --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_HostBarrier.hpp @@ -0,0 +1,146 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_HOST_BARRIER_HPP +#define KOKKOS_HOST_BARRIER_HPP + +#include +#include + +namespace Kokkos { namespace Impl { + +//------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ + +enum : int { RENDEZVOUS_ALIGNMENT = 128 + , RENDEZVOUS_HEADER = RENDEZVOUS_ALIGNMENT + }; + +inline constexpr int rendezvous_buffer_size( const int nthreads ) noexcept +{ + return RENDEZVOUS_HEADER + ((nthreads-1 + RENDEZVOUS_ALIGNMENT-1) / RENDEZVOUS_ALIGNMENT) * RENDEZVOUS_ALIGNMENT; +} + +void rendezvous_initialize( volatile void * buffer + , const int size + , const int rank + ) noexcept; + + +bool rendezvous( volatile void * buffer + , uint64_t & step + , const int size + , const int rank + , bool active_wait = true + ) noexcept; + +void rendezvous_release( volatile void * buffer + , const uint64_t step + ) noexcept; + + +//------------------------------------------------------------------------------ +//------------------------------------------------------------------------------ + + +class HostBarrier +{ +public: + + enum : int { ALIGNMENT = RENDEZVOUS_ALIGNMENT }; + enum : int { HEADER = ALIGNMENT}; + + enum Policy : int { ACTIVE, PASSIVE }; + + inline static constexpr int buffer_size( const int nthreads ) noexcept + { + return rendezvous_buffer_size(nthreads); + } + + HostBarrier( volatile void * arg_buffer + , int arg_size + , int arg_rank + , Policy arg_policy + ) noexcept + : m_buffer{arg_buffer} + , m_size{arg_size} + , m_rank{arg_rank} + , m_policy{arg_policy} + , m_step{0} + { + rendezvous_initialize( m_buffer, m_size, m_rank ); + } + + bool rendezvous() const noexcept + { + return Kokkos::Impl::rendezvous( m_buffer + , m_step + , m_size + , m_rank + , m_policy == ACTIVE + ); + } + + void rendezvous_release() const noexcept + { 
+ Kokkos::Impl::rendezvous_release( m_buffer, m_step ); + } + +private: + volatile void * m_buffer ; + const int m_size ; + const int m_rank ; + const Policy m_policy ; + mutable uint64_t m_step ; + +private: + HostBarrier( const HostBarrier & ) = delete; + HostBarrier( HostBarrier && ) = delete; + HostBarrier & operator=( const HostBarrier & ) = delete; + HostBarrier & operator=( HostBarrier && ) = delete; +}; + +}} // namespace Kokkos::Impl + +#endif // KOKKOS_HOST_BARRIER_HPP + diff --git a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp index a5a73ddebb..4cec5ebad9 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostSpace.cpp @@ -254,7 +254,12 @@ void * HostSpace::allocate( const size_t arg_alloc_size ) const } -void HostSpace::deallocate( void * const arg_alloc_ptr , const size_t arg_alloc_size ) const +void HostSpace::deallocate( void * const arg_alloc_ptr + , const size_t +#if defined( KOKKOS_IMPL_POSIX_MMAP_FLAGS ) + arg_alloc_size +#endif + ) const { if ( arg_alloc_ptr ) { @@ -409,7 +414,7 @@ SharedAllocationRecord< Kokkos::HostSpace , void >::get_record( void * alloc_ptr // Iterate records to print orphaned memory ... 
void SharedAllocationRecord< Kokkos::HostSpace , void >:: -print_records( std::ostream & s , const Kokkos::HostSpace & space , bool detail ) +print_records( std::ostream & s , const Kokkos::HostSpace & , bool detail ) { SharedAllocationRecord< void , void >::print_host_accessible_records( s , "HostSpace" , & s_root_record , detail ); } diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp index 047b262422..c2c6e45ef8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.cpp @@ -206,147 +206,6 @@ void HostThreadTeamData::disband_team() m_team_rendezvous_step = 0 ; } -//---------------------------------------------------------------------------- -/* pattern for rendezvous - * - * if ( rendezvous() ) { - * ... all other threads are still in team_rendezvous() ... - * rendezvous_release(); - * ... all other threads are released from team_rendezvous() ... - * } - */ - -int HostThreadTeamData::rendezvous( int64_t * const buffer - , int & rendezvous_step - , int const size - , int const rank ) noexcept -{ - enum : int { shift_byte = 3 }; - enum : int { size_byte = ( 01 << shift_byte ) }; // == 8 - enum : int { mask_byte = size_byte - 1 }; - - enum : int { shift_mem_cycle = 2 }; - enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 - enum : int { mask_mem_cycle = size_mem_cycle - 1 }; - - // Cycle step values: 1 <= step <= size_val_cycle - // An odd multiple of memory cycle so that when a memory location - // is reused it has a different value. - // Must be representable within a single byte: size_val_cycle < 16 - - enum : int { size_val_cycle = 3 * size_mem_cycle }; - - // Requires: - // Called by rank = [ 0 .. 
size ) - // buffer aligned to int64_t[4] - - // A sequence of rendezvous uses four cycled locations in memory - // and non-equal cycled synchronization values to - // 1) prevent rendezvous from overtaking one another and - // 2) give each spin wait location an int64_t[4] span - // so that it has its own cache line. - - const int step = ( rendezvous_step % size_val_cycle ) + 1 ; - - rendezvous_step = step ; - - // The leading int64_t[4] span is for thread 0 to write - // and all other threads to read spin-wait. - // sync_offset is the index into this array for this step. - - const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle ; - - if ( rank ) { - - const int group_begin = rank << shift_byte ; // == rank * size_byte - - if ( group_begin < size ) { - - // This thread waits for threads - // [ group_begin .. group_begin + 8 ) - // [ rank*8 .. rank*8 + 8 ) - // to write to their designated bytes. - - const int end = group_begin + size_byte < size - ? size_byte : size - group_begin ; - - int64_t value = 0 ; - - for ( int i = 0 ; i < end ; ++i ) { - ((int8_t*) & value )[i] = int8_t( step ); - } - // Do not REMOVE this store fence!!! - // Makes stuff hang on GCC with more than 8 threads - store_fence(); - spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] - , value ); - } - - { - // This thread sets its designated byte. - // ( rank % size_byte ) + - // ( ( rank / size_byte ) * size_byte * size_mem_cycle ) + - // ( sync_offset * size_byte ) - const int offset = ( rank & mask_byte ) - + ( ( rank & ~mask_byte ) << shift_mem_cycle ) - + ( sync_offset << shift_byte ); - - // All of this thread's previous memory stores must be complete before - // this thread stores the step value at this thread's designated byte - // in the shared synchronization array. 
- - Kokkos::memory_fence(); - - ((volatile int8_t*) buffer)[ offset ] = int8_t( step ); - - // Memory fence to push the previous store out - Kokkos::memory_fence(); - } - - // Wait for thread 0 to release all other threads - - spinwait_until_equal( buffer[ step & mask_mem_cycle ] , int64_t(step) ); - - } - else { - // Thread 0 waits for threads [1..7] - // to write to their designated bytes. - - const int end = size_byte < size ? 8 : size ; - - int64_t value = 0 ; - for ( int i = 1 ; i < end ; ++i ) { - ((int8_t *) & value)[i] = int8_t( step ); - } - - spinwait_until_equal( buffer[ sync_offset ], value ); - } - - return rank ? 0 : 1 ; -} - -void HostThreadTeamData:: - rendezvous_release( int64_t * const buffer - , int const rendezvous_step ) noexcept -{ - enum : int { shift_mem_cycle = 2 }; - enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 - enum : int { mask_mem_cycle = size_mem_cycle - 1 }; - - // Requires: - // Called after team_rendezvous - // Called only by true == team_rendezvous(root) - - // Memory fence to be sure all previous writes are complete: - Kokkos::memory_fence(); - - ((volatile int64_t*) buffer)[ rendezvous_step & mask_mem_cycle ] = - int64_t( rendezvous_step ); - - // Memory fence to push the store out - Kokkos::memory_fence(); -} - //---------------------------------------------------------------------------- int HostThreadTeamData::get_work_stealing() noexcept diff --git a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp index 7facc0a410..dc3b89c7c7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_HostThreadTeam.hpp @@ -50,7 +50,7 @@ #include #include #include -#include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -113,50 +113,29 @@ private: int m_league_size ; int m_work_chunk ; int m_steal_rank 
; // work stealing rank - int mutable m_team_rendezvous_step ; + uint64_t mutable m_pool_rendezvous_step ; + uint64_t mutable m_team_rendezvous_step ; HostThreadTeamData * team_member( int r ) const noexcept { return ((HostThreadTeamData**)(m_pool_scratch+m_pool_members))[m_team_base+r]; } - // Rendezvous pattern: - // if ( rendezvous(root) ) { - // ... only root thread here while all others wait ... - // rendezvous_release(); - // } - // else { - // ... all other threads release here ... - // } - // - // Requires: buffer[ ( max_threads / 8 ) * 4 + 4 ]; 0 == max_threads % 8 - // - static - int rendezvous( int64_t * const buffer - , int & rendezvous_step - , int const size - , int const rank ) noexcept ; - - static - void rendezvous_release( int64_t * const buffer - , int const rendezvous_step ) noexcept ; - public: inline int team_rendezvous( int const root ) const noexcept { return 1 == m_team_size ? 1 : - HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size - , ( m_team_rank + m_team_size - root ) % m_team_size ); + , ( m_team_rank + m_team_size - root ) % m_team_size + ); } inline int team_rendezvous() const noexcept { return 1 == m_team_size ? 1 : - HostThreadTeamData:: rendezvous( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step , m_team_size @@ -167,7 +146,6 @@ public: void team_rendezvous_release() const noexcept { if ( 1 < m_team_size ) { - HostThreadTeamData:: rendezvous_release( m_team_scratch + m_team_rendezvous , m_team_rendezvous_step ); } @@ -176,30 +154,30 @@ public: inline int pool_rendezvous() const noexcept { - static constexpr int yield_wait = + static constexpr bool active_wait = #if defined( KOKKOS_COMPILER_IBM ) // If running on IBM POWER architecture the global // level rendzvous should immediately yield when // waiting for other threads in the pool to arrive. - 1 + false #else - 0 + true #endif ; return 1 == m_pool_size ? 
1 : - Kokkos::Impl:: rendezvous( m_pool_scratch + m_pool_rendezvous + , m_pool_rendezvous_step , m_pool_size , m_pool_rank - , yield_wait ); + , active_wait + ); } inline void pool_rendezvous_release() const noexcept { if ( 1 < m_pool_size ) { - Kokkos::Impl:: - rendezvous_release( m_pool_scratch + m_pool_rendezvous ); + rendezvous_release( m_pool_scratch + m_pool_rendezvous, m_pool_rendezvous_step ); } } @@ -225,6 +203,7 @@ public: , m_league_size(1) , m_work_chunk(0) , m_steal_rank(0) + , m_pool_rendezvous_step(0) , m_team_rendezvous_step(0) {} diff --git a/lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp b/lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp new file mode 100644 index 0000000000..d7fe74a6d8 --- /dev/null +++ b/lib/kokkos/core/src/impl/Kokkos_MemoryPool.cpp @@ -0,0 +1,125 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +/* Verify size constraints: + * min_block_alloc_size <= max_block_alloc_size + * max_block_alloc_size <= min_superblock_size + * min_superblock_size <= max_superblock_size + * min_superblock_size <= min_total_alloc_size + * min_superblock_size <= min_block_alloc_size * + * max_block_per_superblock + */ +void memory_pool_bounds_verification + ( size_t min_block_alloc_size + , size_t max_block_alloc_size + , size_t min_superblock_size + , size_t max_superblock_size + , size_t max_block_per_superblock + , size_t min_total_alloc_size + ) +{ + const size_t max_superblock = + min_block_alloc_size * max_block_per_superblock ; + + if ( ( size_t(max_superblock_size) < min_superblock_size ) || + ( min_total_alloc_size < min_superblock_size ) || + ( max_superblock < min_superblock_size ) || + ( min_superblock_size 
< max_block_alloc_size ) || + ( max_block_alloc_size < min_block_alloc_size ) ) { + + std::ostringstream msg ; + + msg << "Kokkos::MemoryPool size constraint violation" ; + + if ( size_t(max_superblock_size) < min_superblock_size ) { + msg << " : max_superblock_size(" + << max_superblock_size + << ") < min_superblock_size(" + << min_superblock_size << ")" ; + } + + if ( min_total_alloc_size < min_superblock_size ) { + msg << " : min_total_alloc_size(" + << min_total_alloc_size + << ") < min_superblock_size(" + << min_superblock_size << ")" ; + } + + if ( max_superblock < min_superblock_size ) { + msg << " : max_superblock(" + << max_superblock + << ") < min_superblock_size(" + << min_superblock_size << ")" ; + } + + if ( min_superblock_size < max_block_alloc_size ) { + msg << " : min_superblock_size(" + << min_superblock_size + << ") < max_block_alloc_size(" + << max_block_alloc_size << ")" ; + } + + if ( max_block_alloc_size < min_block_alloc_size ) { + msg << " : max_block_alloc_size(" + << max_block_alloc_size + << ") < min_block_alloc_size(" + << min_block_alloc_size << ")" ; + } + + Kokkos::Impl::throw_runtime_exception( msg.str() ); + } +} + +} +} + diff --git a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp index eedf3d559e..abd9fe6724 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Memory_Fence.hpp @@ -44,6 +44,11 @@ #include #if defined( KOKKOS_ATOMIC_HPP ) && ! 
defined( KOKKOS_MEMORY_FENCE_HPP ) #define KOKKOS_MEMORY_FENCE_HPP + +#if !defined(_OPENMP) +#include +#endif + namespace Kokkos { //---------------------------------------------------------------------------- @@ -51,25 +56,12 @@ namespace Kokkos { KOKKOS_FORCEINLINE_FUNCTION void memory_fence() { -#if defined( __CUDA_ARCH__ ) +#if defined( __CUDA_ARCH__ ) __threadfence(); -#elif defined( KOKKOS_ENABLE_ROCM_ATOMICS ) - amp_barrier(CLK_LOCAL_MEM_FENCE | CLK_GLOBAL_MEM_FENCE); -#elif defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) - asm volatile ( - "mfence" ::: "memory" - ); -#elif defined( KOKKOS_ENABLE_GNU_ATOMICS ) || \ - ( defined( KOKKOS_COMPILER_NVCC ) && defined( KOKKOS_ENABLE_INTEL_ATOMICS ) ) - __sync_synchronize(); -#elif defined( KOKKOS_ENABLE_INTEL_ATOMICS ) - _mm_mfence(); -#elif defined( KOKKOS_ENABLE_OPENMP_ATOMICS ) +#elif defined( _OPENMP ) #pragma omp flush -#elif defined( KOKKOS_ENABLE_WINDOWS_ATOMICS ) - MemoryBarrier(); #else - #error "Error: memory_fence() not defined" + std::atomic_thread_fence( std::memory_order_seq_cst ); #endif } @@ -81,12 +73,12 @@ void memory_fence() KOKKOS_FORCEINLINE_FUNCTION void store_fence() { -#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) - asm volatile ( - "sfence" ::: "memory" - ); +#if defined( __CUDA_ARCH__ ) + __threadfence(); +#elif defined( _OPENMP ) + #pragma omp flush #else - memory_fence(); + std::atomic_thread_fence( std::memory_order_seq_cst ); #endif } @@ -98,12 +90,12 @@ void store_fence() KOKKOS_FORCEINLINE_FUNCTION void load_fence() { -#if defined( KOKKOS_ENABLE_ASM ) && defined( KOKKOS_ENABLE_ISA_X86_64 ) - asm volatile ( - "lfence" ::: "memory" - ); +#if defined( __CUDA_ARCH__ ) + __threadfence(); +#elif defined( _OPENMP ) + #pragma omp flush #else - memory_fence(); + std::atomic_thread_fence( std::memory_order_seq_cst ); #endif } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp 
b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp index 608d514c79..a90bd507d5 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.cpp @@ -69,6 +69,13 @@ static deallocateDataFunction deallocateDataCallee = nullptr; static beginDeepCopyFunction beginDeepCopyCallee = nullptr; static endDeepCopyFunction endDeepCopyCallee = nullptr; +static createProfileSectionFunction createSectionCallee = nullptr; +static startProfileSectionFunction startSectionCallee = nullptr; +static stopProfileSectionFunction stopSectionCallee = nullptr; +static destroyProfileSectionFunction destroySectionCallee = nullptr; + +static profileEventFunction profileEventCallee = nullptr; + SpaceHandle::SpaceHandle(const char* space_name) { strncpy(name,space_name,64); } @@ -162,6 +169,37 @@ void endDeepCopy() { } } +void createProfileSection(const std::string& sectionName, uint32_t* secID) { + + if(nullptr != createSectionCallee) { + (*createSectionCallee)(sectionName.c_str(), secID); + } +} + +void startSection(const uint32_t secID) { + if(nullptr != startSectionCallee) { + (*startSectionCallee)(secID); + } +} + +void stopSection(const uint32_t secID) { + if(nullptr != stopSectionCallee) { + (*stopSectionCallee)(secID); + } +} + +void destroyProfileSection(const uint32_t secID) { + if(nullptr != destroySectionCallee) { + (*destroySectionCallee)(secID); + } +} + +void markEvent(const std::string& eventName) { + if(nullptr != profileEventCallee) { + (*profileEventCallee)(eventName.c_str()); + } +} + void initialize() { // Make sure initialize calls happens only once @@ -230,7 +268,18 @@ void initialize() { beginDeepCopyCallee = *((beginDeepCopyFunction*) &p13); auto p14 = dlsym(firstProfileLibrary, "kokkosp_end_deep_copy"); endDeepCopyCallee = *((endDeepCopyFunction*) &p14); - + + auto p15 = dlsym(firstProfileLibrary, "kokkosp_create_profile_section"); + createSectionCallee = *((createProfileSectionFunction*) &p15); + 
auto p16 = dlsym(firstProfileLibrary, "kokkosp_start_profile_section"); + startSectionCallee = *((startProfileSectionFunction*) &p16); + auto p17 = dlsym(firstProfileLibrary, "kokkosp_stop_profile_section"); + stopSectionCallee = *((stopProfileSectionFunction*) &p17); + auto p18 = dlsym(firstProfileLibrary, "kokkosp_destroy_profile_section"); + destroySectionCallee = *((destroyProfileSectionFunction*) &p18); + + auto p19 = dlsym(firstProfileLibrary, "kokkosp_profile_event"); + profileEventCallee = *((profileEventFunction*) &p19); } } @@ -274,6 +323,13 @@ void finalize() { beginDeepCopyCallee = nullptr; endDeepCopyCallee = nullptr; + + createSectionCallee = nullptr; + startSectionCallee = nullptr; + stopSectionCallee = nullptr; + destroySectionCallee = nullptr; + + profileEventCallee = nullptr; } } } diff --git a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp index 2c2e524d9d..f348239e08 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Profiling_Interface.hpp @@ -45,6 +45,7 @@ #define KOKKOSP_INTERFACE_HPP #include + #if defined(KOKKOS_ENABLE_PROFILING) #include @@ -57,7 +58,7 @@ #include #include -#define KOKKOSP_INTERFACE_VERSION 20150628 +#define KOKKOSP_INTERFACE_VERSION 20171029 namespace Kokkos { namespace Profiling { @@ -81,6 +82,13 @@ typedef void (*popFunction)(); typedef void (*allocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); typedef void (*deallocateDataFunction)(const SpaceHandle, const char*, const void*, const uint64_t); +typedef void (*createProfileSectionFunction)(const char*, uint32_t*); +typedef void (*startProfileSectionFunction)(const uint32_t); +typedef void (*stopProfileSectionFunction)(const uint32_t); +typedef void (*destroyProfileSectionFunction)(const uint32_t); + +typedef void (*profileEventFunction)(const char*); + typedef void (*beginDeepCopyFunction)( SpaceHandle, const char*, 
const void*, SpaceHandle, const char*, const void*, @@ -99,6 +107,13 @@ void endParallelReduce(const uint64_t kernelID); void pushRegion(const std::string& kName); void popRegion(); +void createProfileSection(const std::string& sectionName, uint32_t* secID); +void startSection(const uint32_t secID); +void stopSection(const uint32_t secID); +void destroyProfileSection(const uint32_t secID); + +void markEvent(const std::string& evName); // by reference: must match the definition in Kokkos_Profiling_Interface.cpp + void allocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); void deallocateData(const SpaceHandle space, const std::string label, const void* ptr, const uint64_t size); diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp b/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp deleted file mode 100644 index ac697fce4b..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 2.0 -// Copyright (2014) Sandia Corporation -// -// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. 
-// -// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#include -#include -#include -#include - -namespace Kokkos { namespace Impl { - -//---------------------------------------------------------------------------- -/* pattern for rendezvous - * - * if ( rendezvous() ) { - * ... all other threads are still in team_rendezvous() ... - * rendezvous_release(); - * ... all other threads are released from team_rendezvous() ... - * } - */ - -int rendezvous( volatile int64_t * const buffer - , int const size - , int const rank - , int const slow - ) noexcept -{ - enum : int { shift_byte = 3 }; - enum : int { size_byte = ( 01 << shift_byte ) }; // == 8 - enum : int { mask_byte = size_byte - 1 }; - - enum : int { shift_mem_cycle = 2 }; - enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 - enum : int { mask_mem_cycle = size_mem_cycle - 1 }; - - // Cycle step values: 1 <= step <= size_val_cycle - // An odd multiple of memory cycle so that when a memory location - // is reused it has a different value. 
- // Must be representable within a single byte: size_val_cycle < 16 - - enum : int { size_val_cycle = 3 * size_mem_cycle }; - - // Requires: - // Called by rank = [ 0 .. size ) - // buffer aligned to int64_t[4] - - // A sequence of rendezvous uses four cycled locations in memory - // and non-equal cycled synchronization values to - // 1) prevent rendezvous from overtaking one another and - // 2) give each spin wait location an int64_t[4] span - // so that it has its own cache line. - - const int64_t step = (buffer[0] % size_val_cycle ) + 1 ; - - // The leading int64_t[4] span is for thread 0 to write - // and all other threads to read spin-wait. - // sync_offset is the index into this array for this step. - - const int sync_offset = ( step & mask_mem_cycle ) + size_mem_cycle + size_mem_cycle ; - - if ( rank ) { - - const int group_begin = rank << shift_byte ; // == rank * size_byte - - if ( group_begin < size ) { - - // This thread waits for threads - // [ group_begin .. group_begin + 8 ) - // [ rank*8 .. rank*8 + 8 ) - // to write to their designated bytes. - - const int end = group_begin + size_byte < size - ? size_byte : size - group_begin ; - - int64_t value = 0; - for ( int i = 0 ; i < end ; ++i ) { - value |= step << (i * size_byte ); - } - - store_fence(); // This should not be needed but fixes #742 - - if ( slow ) { - yield_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] - , value ); - } - else { - spinwait_until_equal( buffer[ (rank << shift_mem_cycle) + sync_offset ] - , value ); - } - } - - { - // This thread sets its designated byte. 
- // ( rank % size_byte ) + - // ( ( rank / size_byte ) * size_byte * size_mem_cycle ) + - // ( sync_offset * size_byte ) - const int offset = ( rank & mask_byte ) - + ( ( rank & ~mask_byte ) << shift_mem_cycle ) - + ( sync_offset << shift_byte ); - - // All of this thread's previous memory stores must be complete before - // this thread stores the step value at this thread's designated byte - // in the shared synchronization array. - - Kokkos::memory_fence(); - - ((volatile int8_t*) buffer)[ offset ] = int8_t( step ); - - // Memory fence to push the previous store out - Kokkos::memory_fence(); - } - - // Wait for thread 0 to release all other threads - - if ( slow ) { - yield_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); - } - else { - spinwait_until_equal( buffer[ (step & mask_mem_cycle) + size_mem_cycle ] , int64_t(step) ); - } - } - else { - // Thread 0 waits for threads [1..7] - // to write to their designated bytes. - - const int end = size_byte < size ? 8 : size ; - - int64_t value = 0; - for ( int i = 1 ; i < end ; ++i ) { - value |= step << (i * size_byte ); - } - - if ( slow ) { - yield_until_equal( buffer[ sync_offset ], value ); - } - else { - spinwait_until_equal( buffer[ sync_offset ], value ); - } - } - - return rank ? 
0 : 1 ; -} - -void rendezvous_release( volatile int64_t * const buffer ) noexcept -{ - enum : int { shift_mem_cycle = 2 }; - enum : int { size_mem_cycle = ( 01 << shift_mem_cycle ) }; // == 4 - enum : int { mask_mem_cycle = size_mem_cycle - 1 }; - enum : int { size_val_cycle = 3 * size_mem_cycle }; - - // Requires: - // Called after team_rendezvous - // Called only by true == team_rendezvous(root) - - // update step - const int64_t step = (buffer[0] % size_val_cycle ) + 1; - buffer[0] = step; - - // Memory fence to be sure all previous writes are complete: - Kokkos::memory_fence(); - - buffer[ (step & mask_mem_cycle) + size_mem_cycle ] = step; - - // Memory fence to push the store out - Kokkos::memory_fence(); -} - -}} // namespace Kokkos::Impl - diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp index dfbeba461e..f3b048d58c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial.cpp @@ -60,6 +60,8 @@ namespace { HostThreadTeamData g_serial_thread_team_data ; +bool g_serial_is_initialized = false; + } // Resize thread team data scratch memory @@ -136,9 +138,9 @@ HostThreadTeamData * serial_get_thread_team_data() namespace Kokkos { -int Serial::is_initialized() +bool Serial::is_initialized() { - return 1 ; + return Impl::g_serial_is_initialized ; } void Serial::initialize( unsigned threads_count @@ -158,6 +160,8 @@ void Serial::initialize( unsigned threads_count #if defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::initialize(); #endif + + Impl::g_serial_is_initialized = true; } void Serial::finalize() @@ -177,6 +181,8 @@ void Serial::finalize() #if defined(KOKKOS_ENABLE_PROFILING) Kokkos::Profiling::finalize(); #endif + + Impl::g_serial_is_initialized = false; } const char* Serial::name() { return "Serial"; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp index dc30ffe9e0..6f247608d9 
100644 --- a/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Serial_WorkGraphPolicy.hpp @@ -49,51 +49,50 @@ namespace Impl { template< class FunctorType , class ... Traits > class ParallelFor< FunctorType , - Kokkos::Experimental::WorkGraphPolicy< Traits ... > , + Kokkos::WorkGraphPolicy< Traits ... > , Kokkos::Serial > - : public Kokkos::Impl::Experimental:: - WorkGraphExec< FunctorType, - Kokkos::Serial, - Traits ... - > { private: - typedef Kokkos::Experimental::WorkGraphPolicy< Traits ... > Policy ; - typedef Kokkos::Impl::Experimental:: - WorkGraphExec Base ; + typedef Kokkos::WorkGraphPolicy< Traits ... > Policy ; + + Policy m_policy ; + FunctorType m_functor ; template< class TagType > typename std::enable_if< std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - Base::m_functor( i ); - } + exec_one( const std::int32_t w ) const noexcept + { m_functor( w ); } template< class TagType > typename std::enable_if< ! std::is_same< TagType , void >::value >::type - exec_one(const typename Policy::member_type& i) const { - const TagType t{} ; - Base::m_functor( t , i ); - } + exec_one( const std::int32_t w ) const noexcept + { const TagType t{}; m_functor( t , w ); } public: inline - void execute() - { - for (std::int32_t i; (-1 != (i = Base::before_work())); ) { - exec_one< typename Policy::work_tag >( i ); - Base::after_work(i); + void execute() const noexcept + { + // Spin until COMPLETED_TOKEN. + // END_TOKEN indicates no work is currently available. 
+ + for ( std::int32_t w = Policy::END_TOKEN ; + Policy::COMPLETED_TOKEN != ( w = m_policy.pop_work() ) ; ) { + if ( Policy::END_TOKEN != w ) { + exec_one< typename Policy::work_tag >( w ); + m_policy.completed_work(w); + } + } } - } inline ParallelFor( const FunctorType & arg_functor , const Policy & arg_policy ) - : Base( arg_functor, arg_policy ) - { - } + : m_policy( arg_policy ) + , m_functor( arg_functor ) + {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp index af79523e0c..87228ea784 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.cpp @@ -306,7 +306,7 @@ print_host_accessible_records( std::ostream & s , reinterpret_cast( r->m_dealloc ) , r->m_alloc_ptr->m_label ); - std::cout << buffer ; + s << buffer ; r = r->m_next ; } while ( r != root ); } @@ -334,7 +334,7 @@ print_host_accessible_records( std::ostream & s else { snprintf( buffer , 256 , "%s [ 0 + 0 ]\n" , space_name ); } - std::cout << buffer ; + s << buffer ; r = r->m_next ; } while ( r != root ); } diff --git a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp index 2e3cc1a163..73fabe0e0a 100644 --- a/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_SharedAlloc.hpp @@ -294,9 +294,13 @@ public: template< class MemorySpace > constexpr - SharedAllocationRecord< MemorySpace , void > & - get_record() const - { return * static_cast< SharedAllocationRecord< MemorySpace , void > * >( m_record ); } + SharedAllocationRecord< MemorySpace , void > * + get_record() const noexcept + { + return ( m_record_bits & DO_NOT_DEREF_FLAG ) + ? 
(SharedAllocationRecord< MemorySpace,void>*) 0 + : static_cast*>(m_record); + } template< class MemorySpace > std::string get_label() const @@ -323,6 +327,16 @@ public: return (m_record_bits & (~DO_NOT_DEREF_FLAG)) != 0; } + KOKKOS_FORCEINLINE_FUNCTION + void clear() + { + // If this is tracking then must decrement + KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT + // Reset to default constructed value. + m_record_bits = DO_NOT_DEREF_FLAG ; + } + + // Copy: KOKKOS_FORCEINLINE_FUNCTION ~SharedAllocationTracker() { KOKKOS_IMPL_SHARED_ALLOCATION_TRACKER_DECREMENT } diff --git a/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp index 3d3f83ed85..f151071a9f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.cpp @@ -48,7 +48,7 @@ #include #include -#if defined( KOKKOS_ENABLE_STDTHREAD ) +#if defined( KOKKOS_ENABLE_STDTHREAD) || defined( _WIN32 ) #include #elif !defined( _WIN32 ) #include @@ -63,9 +63,8 @@ namespace Kokkos { namespace Impl { -namespace { -void host_thread_yield( const uint32_t i , const int force_yield ) +void host_thread_yield( const uint32_t i , const WaitMode mode ) { static constexpr uint32_t sleep_limit = 1 << 13 ; static constexpr uint32_t yield_limit = 1 << 12 ; @@ -76,28 +75,26 @@ void host_thread_yield( const uint32_t i , const int force_yield ) // Attempt to put the thread to sleep for 'c' milliseconds - #if defined( KOKKOS_ENABLE_STDTHREAD ) - std::this_thread::sleep_for( std::chrono::nanoseconds( c * 1000 ) ) - #elif !defined( _WIN32 ) + #if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 ) + auto start = std::chrono::high_resolution_clock::now(); + std::this_thread::yield(); + std::this_thread::sleep_until( start + std::chrono::nanoseconds( c * 1000 ) ); + #else timespec req ; req.tv_sec = 0 ; req.tv_nsec = 1000 * c ; nanosleep( &req, nullptr ); - #else /* defined( _WIN32 ) IS Microsoft Windows */ - Sleep(c); #endif } - else if ( force_yield 
|| yield_limit < i ) { + else if ( mode == WaitMode::PASSIVE || yield_limit < i ) { // Attempt to yield thread resources to runtime - #if defined( KOKKOS_ENABLE_STDTHREAD ) + #if defined( KOKKOS_ENABLE_STDTHREAD ) || defined( _WIN32 ) std::this_thread::yield(); - #elif !defined( _WIN32 ) + #else sched_yield(); - #else /* defined( _WIN32 ) IS Microsoft Windows */ - YieldProcessor(); #endif } @@ -110,9 +107,9 @@ void host_thread_yield( const uint32_t i , const int force_yield ) for ( int k = 0 ; k < c ; ++k ) { #if defined( __amd64 ) || defined( __amd64__ ) || \ defined( __x86_64 ) || defined( __x86_64__ ) - #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */ + #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */ asm volatile( "nop\n" ); - #else + #else __asm__ __volatile__( "nop\n" ); #endif #elif defined(__PPC64__) @@ -123,86 +120,22 @@ void host_thread_yield( const uint32_t i , const int force_yield ) { // Insert memory pause - #if defined( __amd64 ) || defined( __amd64__ ) || \ - defined( __x86_64 ) || defined( __x86_64__ ) - #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */ + #if defined( __amd64 ) || defined( __amd64__ ) || \ + defined( __x86_64 ) || defined( __x86_64__ ) + #if !defined( _WIN32 ) /* IS NOT Microsoft Windows */ asm volatile( "pause\n":::"memory" ); - #else + #else __asm__ __volatile__( "pause\n":::"memory" ); #endif #elif defined(__PPC64__) - asm volatile( "or 27, 27, 27" ::: "memory" ); + asm volatile( "or 27, 27, 27" ::: "memory" ); #endif } #endif /* defined( KOKKOS_ENABLE_ASM ) */ } -}}} // namespace Kokkos::Impl::{anonymous} - -/*--------------------------------------------------------------------------*/ - -namespace Kokkos { -namespace Impl { - -void spinwait_while_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0); - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int32_t & flag , const int32_t value ) -{ - 
Kokkos::store_fence(); - uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0); - Kokkos::load_fence(); -} - -void spinwait_while_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,0); - Kokkos::load_fence(); -} - -void spinwait_until_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,0); - Kokkos::load_fence(); -} - -void yield_while_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1); - Kokkos::load_fence(); -} - -void yield_until_equal( volatile int32_t & flag , const int32_t value ) -{ - Kokkos::store_fence(); - uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); - Kokkos::load_fence(); -} - -void yield_while_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - uint32_t i = 0 ; while( value == flag ) host_thread_yield(++i,1); - Kokkos::load_fence(); -} - -void yield_until_equal( volatile int64_t & flag , const int64_t value ) -{ - Kokkos::store_fence(); - uint32_t i = 0 ; while( value != flag ) host_thread_yield(++i,1); - Kokkos::load_fence(); -} - -} /* namespace Impl */ -} /* namespace Kokkos */ +}} // namespace Kokkos::Impl #else void KOKKOS_CORE_SRC_IMPL_SPINWAIT_PREVENT_LINK_ERROR() {} diff --git a/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp index b49e308566..8846f1ca51 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Spinwait.hpp @@ -46,47 +46,95 @@ #define KOKKOS_SPINWAIT_HPP #include +#include #include +#include + namespace Kokkos { namespace Impl { #if defined( KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HOST ) -void spinwait_while_equal( volatile int32_t & flag , const int32_t value ); -void spinwait_until_equal( volatile 
int32_t & flag , const int32_t value ); +enum class WaitMode : int { + ACTIVE // Used for tight loops to keep threads active longest + , PASSIVE // Used to quickly yield the thread to quiet down the system +}; -void spinwait_while_equal( volatile int64_t & flag , const int64_t value ); -void spinwait_until_equal( volatile int64_t & flag , const int64_t value ); -void yield_while_equal( volatile int32_t & flag , const int32_t value ); -void yield_until_equal( volatile int32_t & flag , const int32_t value ); +void host_thread_yield( const uint32_t i , const WaitMode mode ); -void yield_while_equal( volatile int64_t & flag , const int64_t value ); -void yield_until_equal( volatile int64_t & flag , const int64_t value ); + +template <typename T> +typename std::enable_if< std::is_integral<T>::value, void>::type +spinwait_while_equal( T const volatile & flag, const T value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; + while( value == flag ) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + +template <typename T> +typename std::enable_if< std::is_integral<T>::value, void>::type +yield_while_equal( T const volatile & flag, const T value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; + while( value == flag ) { + host_thread_yield(++i, WaitMode::PASSIVE); + } + Kokkos::load_fence(); +} + +template <typename T> +typename std::enable_if< std::is_integral<T>::value, void>::type +spinwait_until_equal( T const volatile & flag, const T value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; + while( value != flag ) { + host_thread_yield(++i, WaitMode::ACTIVE); + } + Kokkos::load_fence(); +} + +template <typename T> +typename std::enable_if< std::is_integral<T>::value, void>::type +yield_until_equal( T const volatile & flag, const T value ) +{ + Kokkos::store_fence(); + uint32_t i = 0 ; + while( value != flag ) { + host_thread_yield(++i, WaitMode::PASSIVE); + } + Kokkos::load_fence(); +} #else +template <typename T> KOKKOS_INLINE_FUNCTION -void spinwait_while_equal( volatile int32_t & , const int32_t ) {} 
-KOKKOS_INLINE_FUNCTION -void spinwait_until_equal( volatile int32_t & , const int32_t ) {} +typename std::enable_if< std::is_integral<T>::value, void>::type +spinwait_while_equal( T const volatile & flag, const T value ) {} +template <typename T> KOKKOS_INLINE_FUNCTION -void spinwait_while_equal( volatile int64_t & , const int64_t ) {} -KOKKOS_INLINE_FUNCTION -void spinwait_until_equal( volatile int64_t & , const int64_t ) {} +typename std::enable_if< std::is_integral<T>::value, void>::type +yield_while_equal( T const volatile & flag, const T value ) {} +template <typename T> KOKKOS_INLINE_FUNCTION -void yield_while_equal( volatile int32_t & , const int32_t ) {} -KOKKOS_INLINE_FUNCTION -void yield_until_equal( volatile int32_t & , const int32_t ) {} +typename std::enable_if< std::is_integral<T>::value, void>::type +spinwait_until_equal( T const volatile & flag, const T value ) {} +template <typename T> KOKKOS_INLINE_FUNCTION -void yield_while_equal( volatile int64_t & , const int64_t ) {} -KOKKOS_INLINE_FUNCTION -void yield_until_equal( volatile int64_t & , const int64_t ) {} +typename std::enable_if< std::is_integral<T>::value, void>::type +yield_until_equal( T const volatile & flag, const T value ) {} #endif diff --git a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp index 6300417576..b59548ea1d 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Traits.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Traits.hpp @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -435,21 +436,12 @@ struct power_of_two<1,true> /** \brief If power of two then return power, * otherwise return ~0u. */ -static KOKKOS_FORCEINLINE_FUNCTION +KOKKOS_FORCEINLINE_FUNCTION unsigned power_of_two_if_valid( const unsigned N ) { unsigned p = ~0u ; - if ( N && ! 
( N & ( N - 1 ) ) ) { -#if defined( __CUDA_ARCH__ ) && defined( KOKKOS_ENABLE_CUDA ) - p = __ffs(N) - 1 ; -#elif defined( __GNUC__ ) || defined( __GNUG__ ) - p = __builtin_ffs(N) - 1 ; -#elif defined( __INTEL_COMPILER ) - p = _bit_scan_forward(N); -#else - p = 0 ; - for ( unsigned j = 1 ; ! ( N & j ) ; j <<= 1 ) { ++p ; } -#endif + if ( is_integral_power_of_two ( N ) ) { + p = bit_scan_forward ( N ) ; } return p ; } diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp index f32c6bb2ee..70522d4067 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp @@ -111,7 +111,9 @@ struct ViewCtorProp< void , CommonViewAllocProp > using type = CommonViewAllocProp ; + KOKKOS_INLINE_FUNCTION ViewCtorProp( const type & arg ) : value( arg ) {} + KOKKOS_INLINE_FUNCTION ViewCtorProp( type && arg ) : value( arg ) {} type value ; @@ -128,6 +130,7 @@ struct ViewCtorProp< void , std::integral_constant > ViewCtorProp & operator = ( const ViewCtorProp & ) = default ; template< typename P > + KOKKOS_INLINE_FUNCTION ViewCtorProp( const P & ) {} }; diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp index b2adcc9f06..413b55298c 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp @@ -369,9 +369,9 @@ private: template< size_t ... DimArgs > KOKKOS_FORCEINLINE_FUNCTION - bool set( unsigned domain_rank - , unsigned range_rank - , const ViewDimension< DimArgs ... > & dim ) + bool set( unsigned + , unsigned + , const ViewDimension< DimArgs ... > & ) { return true ; } template< class T , size_t ... DimArgs , class ... 
Args > @@ -1047,7 +1047,7 @@ struct ViewOffset< Dimension , Kokkos::LayoutLeft template< class DimRHS > KOKKOS_INLINE_FUNCTION constexpr ViewOffset( - const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & rhs , + const ViewOffset< DimRHS , Kokkos::LayoutLeft , void > & , const SubviewExtents< DimRHS::rank , dimension_type::rank > & sub ) : m_dim( sub.range_extent(0), 0, 0, 0, 0, 0, 0, 0 ) { @@ -1252,7 +1252,7 @@ public: template< unsigned TrivialScalarSize > KOKKOS_INLINE_FUNCTION constexpr ViewOffset - ( std::integral_constant const & padding_type_size + ( std::integral_constant const & , Kokkos::LayoutLeft const & arg_layout ) : m_dim( arg_layout.dimension[0] , arg_layout.dimension[1] @@ -1741,7 +1741,7 @@ public: template< unsigned TrivialScalarSize > KOKKOS_INLINE_FUNCTION constexpr ViewOffset - ( std::integral_constant const & padding_type_size + ( std::integral_constant const & , Kokkos::LayoutRight const & arg_layout ) : m_dim( arg_layout.dimension[0] , arg_layout.dimension[1] @@ -2368,7 +2368,7 @@ struct ViewDataHandle< Traits , )>::type > { typedef typename Traits::value_type value_type ; - typedef typename Traits::value_type * KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ; + typedef typename Traits::value_type * KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ; typedef typename Traits::value_type & return_type ; typedef Kokkos::Impl::SharedAllocationTracker track_type ; @@ -2376,7 +2376,7 @@ struct ViewDataHandle< Traits , static handle_type assign( value_type * arg_data_ptr , track_type const & /*arg_tracker*/ ) { - if ( reinterpret_cast(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) { + if ( reinterpret_cast(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) { Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute"); } return handle_type( arg_data_ptr ); @@ -2386,7 +2386,7 @@ struct ViewDataHandle< Traits , static handle_type assign( handle_type const arg_data_ptr , size_t offset ) { - if ( 
reinterpret_cast(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) { + if ( reinterpret_cast(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) { Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute"); } return handle_type( arg_data_ptr + offset ); @@ -2411,7 +2411,7 @@ struct ViewDataHandle< Traits , )>::type > { typedef typename Traits::value_type value_type ; - typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_ALIGN_PTR(KOKKOS_ALIGN_SIZE) handle_type ; + typedef typename Traits::value_type * KOKKOS_RESTRICT KOKKOS_IMPL_ALIGN_PTR(KOKKOS_MEMORY_ALIGNMENT) handle_type ; typedef typename Traits::value_type & return_type ; typedef Kokkos::Impl::SharedAllocationTracker track_type ; @@ -2419,7 +2419,7 @@ struct ViewDataHandle< Traits , static handle_type assign( value_type * arg_data_ptr , track_type const & /*arg_tracker*/ ) { - if ( reinterpret_cast(arg_data_ptr) % KOKKOS_ALIGN_SIZE ) { + if ( reinterpret_cast(arg_data_ptr) % Impl::MEMORY_ALIGNMENT ) { Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute"); } return handle_type( arg_data_ptr ); @@ -2429,7 +2429,7 @@ struct ViewDataHandle< Traits , static handle_type assign( handle_type const arg_data_ptr , size_t offset ) { - if ( reinterpret_cast(arg_data_ptr+offset) % KOKKOS_ALIGN_SIZE ) { + if ( reinterpret_cast(arg_data_ptr+offset) % Impl::MEMORY_ALIGNMENT ) { Kokkos::abort("Assigning NonAligned View or Pointer to Kokkos::View with Aligned attribute"); } return handle_type( arg_data_ptr + offset ); @@ -2783,6 +2783,11 @@ public: , m_offset( std::integral_constant< unsigned , 0 >() , arg_layout ) {} + /**\brief Assign data */ + KOKKOS_INLINE_FUNCTION + void assign_data( pointer_type arg_ptr ) + { m_handle = handle_type( arg_ptr ); } + //---------------------------------------- /* Allocate and construct mapped array. 
* Allocate via shared allocation record and diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp index 37367f68e4..54d061a503 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ViewTile.hpp @@ -144,9 +144,9 @@ public: //---------------------------------------- KOKKOS_FUNCTION_DEFAULTED ~ViewOffset() = default ; - KOKKOS_INLINE_FUNCTION ViewOffset() = default ; - KOKKOS_INLINE_FUNCTION ViewOffset( const ViewOffset & ) = default ; - KOKKOS_INLINE_FUNCTION ViewOffset & operator = ( const ViewOffset & ) = default ; + KOKKOS_FUNCTION_DEFAULTED ViewOffset() = default ; + KOKKOS_FUNCTION_DEFAULTED ViewOffset( const ViewOffset & ) = default ; + KOKKOS_FUNCTION_DEFAULTED ViewOffset & operator = ( const ViewOffset & ) = default ; template< unsigned TrivialScalarSize > KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp index 55d7651eca..b8de8674d4 100644 --- a/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_hwloc.cpp @@ -48,6 +48,7 @@ #include #include +#include #include #include @@ -312,14 +313,18 @@ Sentinel::Sentinel() hwloc_get_cpubind( s_hwloc_topology , s_process_binding , HWLOC_CPUBIND_PROCESS ); if ( hwloc_bitmap_iszero( s_process_binding ) ) { - std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl; + if (Kokkos::show_warnings() ) { + std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL processing units" << std::endl; + } const int pu_depth = hwloc_get_type_depth( s_hwloc_topology, HWLOC_OBJ_PU ); int num_pu = 1; if ( pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN ) { num_pu = hwloc_get_nbobjs_by_depth( s_hwloc_topology, pu_depth ); } else { - std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." 
<< std::endl; + if (Kokkos::show_warnings() ) { + std::cerr << "WARNING: Cannot detect number of processing units -- ASSUMING 1 (serial)." << std::endl; + } num_pu = 1; } hwloc_bitmap_set_range( s_process_binding, 0, num_pu-1); @@ -349,7 +354,7 @@ Sentinel::Sentinel() hwloc_bitmap_free( s_process_no_core_zero ); - if ( ! ok ) { + if ( Kokkos::show_warnings() && ! ok ) { std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move process off of core #0" << std::endl ; } } @@ -503,8 +508,8 @@ Sentinel::Sentinel() hwloc_bitmap_free( proc_cpuset_location ); - if ( ! symmetric ) { - std::cout << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology." + if ( Kokkos::show_warnings() && ! symmetric ) { + std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a non-symmetric core topology." << std::endl ; } } diff --git a/lib/kokkos/core/unit_test/CMakeLists.txt b/lib/kokkos/core/unit_test/CMakeLists.txt index 8aeae1199f..83d617b9a6 100644 --- a/lib/kokkos/core/unit_test/CMakeLists.txt +++ b/lib/kokkos/core/unit_test/CMakeLists.txt @@ -2,7 +2,16 @@ # Add test-only library for gtest to be reused by all the subpackages # +IF(NOT KOKKOS_HAS_TRILINOS) + IF(KOKKOS_SEPARATE_LIBS) + set(TEST_LINK_TARGETS kokkoscore) + ELSE() + set(TEST_LINK_TARGETS kokkos) + ENDIF() +ENDIF() + SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) +SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGTEST_HAS_PTHREAD=0") INCLUDE_DIRECTORIES(${GTEST_SOURCE_DIR}) TRIBITS_ADD_LIBRARY( @@ -49,6 +58,7 @@ IF(Kokkos_ENABLE_Serial) serial/TestSerial_SubView_c10.cpp serial/TestSerial_SubView_c11.cpp serial/TestSerial_SubView_c12.cpp + serial/TestSerial_SubView_c13.cpp serial/TestSerial_Team.cpp serial/TestSerial_TeamReductionScan.cpp serial/TestSerial_TeamScratch.cpp @@ -62,7 +72,7 @@ IF(Kokkos_ENABLE_Serial) COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) 
ENDIF() @@ -96,6 +106,7 @@ IF(Kokkos_ENABLE_Pthread) threads/TestThreads_SubView_c10.cpp threads/TestThreads_SubView_c11.cpp threads/TestThreads_SubView_c12.cpp + threads/TestThreads_SubView_c13.cpp threads/TestThreads_Team.cpp threads/TestThreads_TeamReductionScan.cpp threads/TestThreads_TeamScratch.cpp @@ -109,7 +120,7 @@ IF(Kokkos_ENABLE_Pthread) COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() @@ -143,6 +154,7 @@ IF(Kokkos_ENABLE_OpenMP) openmp/TestOpenMP_SubView_c10.cpp openmp/TestOpenMP_SubView_c11.cpp openmp/TestOpenMP_SubView_c12.cpp + openmp/TestOpenMP_SubView_c13.cpp openmp/TestOpenMP_Task.cpp openmp/TestOpenMP_Team.cpp openmp/TestOpenMP_TeamReductionScan.cpp @@ -157,7 +169,7 @@ IF(Kokkos_ENABLE_OpenMP) COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() @@ -184,13 +196,14 @@ IF(Kokkos_ENABLE_Qthreads) qthreads/TestQthreads_SubView_c10.cpp qthreads/TestQthreads_SubView_c11.cpp qthreads/TestQthreads_SubView_c12.cpp + qthreads/TestQthreads_SubView_c13.cpp qthreads/TestQthreads_Team.cpp qthreads/TestQthreads_ViewAPI_a.cpp qthreads/TestQthreads_ViewAPI_b.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() @@ -235,6 +248,7 @@ IF(Kokkos_ENABLE_Cuda) cuda/TestCuda_SubView_c10.cpp cuda/TestCuda_SubView_c11.cpp cuda/TestCuda_SubView_c12.cpp + cuda/TestCuda_SubView_c13.cpp cuda/TestCuda_Task.cpp cuda/TestCuda_Team.cpp cuda/TestCuda_TeamReductionScan.cpp @@ -246,10 +260,11 @@ IF(Kokkos_ENABLE_Cuda) cuda/TestCuda_ViewOfClass.cpp cuda/TestCuda_Crs.cpp cuda/TestCuda_WorkGraph.cpp + cuda/TestCuda_UniqueToken.cpp COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) ENDIF() 
@@ -266,7 +281,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) foreach(INITTESTS_NUM RANGE 1 16) @@ -276,7 +291,7 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) endforeach(INITTESTS_NUM) @@ -286,5 +301,5 @@ TRIBITS_ADD_EXECUTABLE_AND_TEST( COMM serial mpi NUM_MPI_PROCS 1 FAIL_REGULAR_EXPRESSION " FAILED " - TESTONLYLIBS kokkos_gtest + TESTONLYLIBS kokkos_gtest ${TEST_LINK_TARGETS} ) diff --git a/lib/kokkos/core/unit_test/Makefile b/lib/kokkos/core/unit_test/Makefile index 07859f7ac3..ace6181ddf 100644 --- a/lib/kokkos/core/unit_test/Makefile +++ b/lib/kokkos/core/unit_test/Makefile @@ -27,7 +27,8 @@ endif CXXFLAGS = -O3 LINK ?= $(CXX) -LDFLAGS ?= -lpthread +LDFLAGS ?= +override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos @@ -56,6 +57,7 @@ else OBJ_CUDA += TestCuda_SubView_c04.o TestCuda_SubView_c05.o TestCuda_SubView_c06.o OBJ_CUDA += TestCuda_SubView_c07.o TestCuda_SubView_c08.o TestCuda_SubView_c09.o OBJ_CUDA += TestCuda_SubView_c10.o TestCuda_SubView_c11.o TestCuda_SubView_c12.o + OBJ_CUDA += TestCuda_SubView_c13.o endif OBJ_CUDA += TestCuda_Reductions.o TestCuda_Scan.o OBJ_CUDA += TestCuda_Complex.o @@ -169,6 +171,7 @@ else OBJ_OPENMP += TestOpenMP_SubView_c04.o TestOpenMP_SubView_c05.o TestOpenMP_SubView_c06.o OBJ_OPENMP += TestOpenMP_SubView_c07.o TestOpenMP_SubView_c08.o TestOpenMP_SubView_c09.o OBJ_OPENMP += TestOpenMP_SubView_c10.o TestOpenMP_SubView_c11.o TestOpenMP_SubView_c12.o + OBJ_OPENMP += TestOpenMP_SubView_c13.o endif OBJ_OPENMP += TestOpenMP_Reductions.o TestOpenMP_Scan.o OBJ_OPENMP += TestOpenMP_Complex.o @@ -258,6 +261,7 @@ else OBJ_SERIAL += TestSerial_SubView_c04.o TestSerial_SubView_c05.o TestSerial_SubView_c06.o OBJ_SERIAL += TestSerial_SubView_c07.o 
TestSerial_SubView_c08.o TestSerial_SubView_c09.o OBJ_SERIAL += TestSerial_SubView_c10.o TestSerial_SubView_c11.o TestSerial_SubView_c12.o + OBJ_SERIAL += TestSerial_SubView_c13.o endif OBJ_SERIAL += TestSerial_Reductions.o TestSerial_Scan.o OBJ_SERIAL += TestSerial_Complex.o @@ -326,7 +330,7 @@ KokkosCore_UnitTest_HWLOC: $(OBJ_HWLOC) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_HWLOC) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_HWLOC KokkosCore_UnitTest_AllocationTracker: $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $( $(KOKKOS_LDFLAGS) $(LDFLAGS)LIB) -o KokkosCore_UnitTest_AllocationTracker + $(LINK) $(EXTRA_PATH) $(OBJ_ALLOCATIONTRACKER) $(KOKKOS_LIBS) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(LIB) -o KokkosCore_UnitTest_AllocationTracker KokkosCore_UnitTest_Default: $(OBJ_DEFAULT) $(KOKKOS_LINK_DEPENDS) $(LINK) $(EXTRA_PATH) $(OBJ_DEFAULT) $(KOKKOS_LIBS) $(LIB) $(KOKKOS_LDFLAGS) $(LDFLAGS) -o KokkosCore_UnitTest_Default diff --git a/lib/kokkos/core/unit_test/TestAtomic.hpp b/lib/kokkos/core/unit_test/TestAtomic.hpp index e61d5e730c..fffe6b4699 100644 --- a/lib/kokkos/core/unit_test/TestAtomic.hpp +++ b/lib/kokkos/core/unit_test/TestAtomic.hpp @@ -467,6 +467,7 @@ TEST_F( TEST_CATEGORY, atomics ) ASSERT_TRUE( ( TestAtomic::Loop< float, TEST_EXECSPACE >( 100, 3 ) ) ); #ifndef KOKKOS_ENABLE_OPENMPTARGET +#ifndef KOKKOS_ENABLE_ROCM ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex, TEST_EXECSPACE >( 100, 1 ) ) ); ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex, TEST_EXECSPACE >( 100, 2 ) ) ); ASSERT_TRUE( ( TestAtomic::Loop< Kokkos::complex, TEST_EXECSPACE >( 100, 3 ) ) ); @@ -475,6 +476,7 @@ TEST_F( TEST_CATEGORY, atomics ) ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 2 ) ) ); ASSERT_TRUE( ( TestAtomic::Loop< TestAtomic::SuperScalar<4>, TEST_EXECSPACE >( 100, 3 ) ) ); #endif +#endif } diff --git 
a/lib/kokkos/core/unit_test/TestComplex.hpp b/lib/kokkos/core/unit_test/TestComplex.hpp index c7f681699e..b5f7e2b6ec 100644 --- a/lib/kokkos/core/unit_test/TestComplex.hpp +++ b/lib/kokkos/core/unit_test/TestComplex.hpp @@ -71,11 +71,13 @@ struct TestComplexConstruction { ASSERT_FLOAT_EQ(h_results(7).real(),7.5); ASSERT_FLOAT_EQ(h_results(7).imag(),0.0); ASSERT_FLOAT_EQ(h_results(8).real(),double(8)); ASSERT_FLOAT_EQ(h_results(8).imag(),0.0); +#ifndef KOKKOS_ENABLE_ROCM Kokkos::complex a(1.5,2.5),b(3.25,5.25),r_kk; std::complex sa(a),sb(3.25,5.25),r; r = a; r_kk = a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag()); r = sb*a; r_kk = b*a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag()); r = sa; r_kk = a; ASSERT_FLOAT_EQ(r.real(),r_kk.real()); ASSERT_FLOAT_EQ(r.imag(),r_kk.imag()); +#endif } diff --git a/lib/kokkos/core/unit_test/TestCrs.hpp b/lib/kokkos/core/unit_test/TestCrs.hpp index 90f4036868..50811fb8a9 100644 --- a/lib/kokkos/core/unit_test/TestCrs.hpp +++ b/lib/kokkos/core/unit_test/TestCrs.hpp @@ -65,8 +65,8 @@ struct CountFillFunctor { template< class ExecSpace > void test_count_fill(std::int32_t nrows) { - Kokkos::Experimental::Crs graph; - Kokkos::Experimental::count_and_fill_crs(graph, nrows, CountFillFunctor()); + Kokkos::Crs graph; + Kokkos::count_and_fill_crs(graph, nrows, CountFillFunctor()); ASSERT_EQ(graph.numRows(), nrows); auto row_map = Kokkos::create_mirror_view(graph.row_map); Kokkos::deep_copy(row_map, graph.row_map); diff --git a/lib/kokkos/core/unit_test/TestMDRange.hpp b/lib/kokkos/core/unit_test/TestMDRange.hpp index fbc3a65c2f..b84b13be7c 100644 --- a/lib/kokkos/core/unit_test/TestMDRange.hpp +++ b/lib/kokkos/core/unit_test/TestMDRange.hpp @@ -2489,6 +2489,303 @@ struct TestMDRange_6D { } }; + +template +struct TestMDRange_2D_NegIdx { + + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View< DataType**, ExecSpace >; + using HostViewType = 
typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[2]; + + TestMDRange_2D_NegIdx( const DataType L0, const DataType L1, const DataType N0, const DataType N1 ) : input_view( "input_view", N0 - L0, N1 - L1 ) + { + lower_offset[0] = L0; + lower_offset[1] = L1; + } + + // When using negative indices, must offset View appropriately as views cannot take a negative index + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j ) const + { + input_view( i - lower_offset[0], j - lower_offset[1] ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, value_type &lsum ) const + { + lsum += input_view( i - lower_offset[0], j - lower_offset[1] ) * 2; + } + + static void test_2D_negidx( const int N0, const int N1 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<2>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + const point_type lower{{-1, -1}}; + const point_type upper{{N0, N1}}; + const tile_type tile{{8,8}}; + + range_type range( point_type{{ lower[0], lower[1] }}, point_type{{ upper[0], upper[1] }}, tile_type{{ tile[0], tile[1] }} ); + + TestMDRange_2D_NegIdx functor( lower[0], lower[1], upper[0], upper[1] ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) ); + } + } +}; + +template +struct TestMDRange_3D_NegIdx { + + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View< DataType***, ExecSpace >; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[3]; + + TestMDRange_3D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType N0, const DataType N1, const DataType N2 ) : input_view( "input_view", N0 - L0, N1 - 
L1, N2 - L2 ) + { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + } + + // When using negative indices, must offset View appropriately as views cannot take a negative index + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k ) const + { + input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, value_type &lsum ) const + { + lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2] ) * 2; + } + + static void test_3D_negidx( const int N0, const int N1, const int N2 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<3>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + const point_type lower{{-1, -1, -1}}; + const point_type upper{{N0, N1, N2}}; + const tile_type tile{{8,8,2}}; + + range_type range( point_type{{ lower[0], lower[1], lower[2] }}, point_type{{ upper[0], upper[1], upper[2] }}, tile_type{{ tile[0], tile[1], tile[2] }} ); + + TestMDRange_3D_NegIdx functor( lower[0], lower[1], lower[2], upper[0], upper[1], upper[2] ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) ); + } + } +}; + +template +struct TestMDRange_4D_NegIdx { + + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View< DataType****, ExecSpace >; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[4]; + + TestMDRange_4D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType N0, const DataType N1, const DataType N2, const DataType N3 ) : input_view( "input_view", N0 - L0, 
N1 - L1, N2 - L2, N3 - L3 ) + { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + lower_offset[3] = L3; + } + + // When using negative indices, must offset View appropriately as views cannot take a negative index + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, const int l ) const + { + input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, const int l, value_type &lsum ) const + { + lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3] ) * 2; + } + + static void test_4D_negidx( const int N0, const int N1, const int N2, const int N3 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<4>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + const point_type lower{{-1, -1, -1, -1}}; + const point_type upper{{N0, N1, N2, N3}}; + const tile_type tile{{8,8,2,2}}; + + range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3] }}, point_type{{ upper[0], upper[1], upper[2], upper[3] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3] }} ); + + TestMDRange_4D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], upper[0], upper[1], upper[2], upper[3] ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) ); + } + } +}; + +template +struct TestMDRange_5D_NegIdx { + + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View< DataType*****, ExecSpace >; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[5]; + + 
TestMDRange_5D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4 ) + { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + lower_offset[3] = L3; + lower_offset[4] = L4; + } + + // When using negative indices, must offset View appropriately as views cannot take a negative index + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, const int l, const int m ) const + { + input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, const int l, const int m, value_type &lsum ) const + { + lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4] ) * 2; + } + + static void test_5D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<5>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type point_type; + + const point_type lower{{-1, -1, -1, -1, -1}}; + const point_type upper{{N0, N1, N2, N3, N4}}; + const tile_type tile{{8,4,2,2,2}}; + + range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4] }} ); + + TestMDRange_5D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], upper[0], upper[1], upper[2], upper[3], upper[4] ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); 
+ + ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) ); + } + } +}; + +template +struct TestMDRange_6D_NegIdx { + + using value_type = double; + + using DataType = int; + using ViewType = typename Kokkos::View< DataType******, ExecSpace >; + using HostViewType = typename ViewType::HostMirror; + + ViewType input_view; + DataType lower_offset[6]; + + TestMDRange_6D_NegIdx( const DataType L0, const DataType L1, const DataType L2, const DataType L3, const DataType L4, const DataType L5, const DataType N0, const DataType N1, const DataType N2, const DataType N3, const DataType N4, const DataType N5 ) : input_view( "input_view", N0 - L0, N1 - L1, N2 - L2, N3 - L3, N4 - L4, N5 - L5 ) + { + lower_offset[0] = L0; + lower_offset[1] = L1; + lower_offset[2] = L2; + lower_offset[3] = L3; + lower_offset[4] = L4; + lower_offset[5] = L5; + } + + // When using negative indices, must offset View appropriately as views cannot take a negative index + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, const int l, const int m, const int n ) const + { + input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) = 1; + } + + KOKKOS_INLINE_FUNCTION + void operator()( const int i, const int j, const int k, const int l, const int m, const int n, value_type &lsum ) const + { + lsum += input_view( i - lower_offset[0], j - lower_offset[1], k - lower_offset[2], l - lower_offset[3], m - lower_offset[4], n - lower_offset[5] ) * 2; + } + + static void test_6D_negidx( const int N0, const int N1, const int N2, const int N3, const int N4, const int N5 ) + { + using namespace Kokkos::Experimental; + + { + typedef typename Kokkos::Experimental::MDRangePolicy< ExecSpace, Rank<6>, Kokkos::IndexType > range_type; + typedef typename range_type::tile_type tile_type; + typedef typename range_type::point_type 
point_type; + + const point_type lower{{-1, -1, -1, -1, -1, -1}}; + const point_type upper{{N0, N1, N2, N3, N4, N5}}; + const tile_type tile{{8,4,2,2,2,1}}; + + range_type range( point_type{{ lower[0], lower[1], lower[2], lower[3], lower[4], lower[5] }}, point_type{{ upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] }}, tile_type{{ tile[0], tile[1], tile[2], tile[3], tile[4], tile[5] }} ); + + TestMDRange_6D_NegIdx functor( lower[0], lower[1], lower[2], lower[3], lower[4], lower[5], upper[0], upper[1], upper[2], upper[3], upper[4], upper[5] ); + + parallel_for( range, functor ); + double sum = 0.0; + parallel_reduce( range, functor, sum ); + + ASSERT_EQ( sum, 2 * (upper[0] - lower[0]) * (upper[1] - lower[1]) * (upper[2] - lower[2]) * (upper[3] - lower[3]) * (upper[4] - lower[4]) * (upper[5] - lower[5]) ); + } + } +}; + + } // namespace TEST_F( TEST_CATEGORY , mdrange_for ) { @@ -2512,6 +2809,14 @@ TEST_F( TEST_CATEGORY , mdrange_array_reduce ) { TestMDRange_ReduceArray_2D< TEST_EXECSPACE >::test_arrayreduce2( 4, 5 ); TestMDRange_ReduceArray_3D< TEST_EXECSPACE >::test_arrayreduce3( 4, 5, 10 ); } + +TEST_F( TEST_CATEGORY , mdrange_neg_idx ) { + TestMDRange_2D_NegIdx< TEST_EXECSPACE >::test_2D_negidx( 128, 32 ); + TestMDRange_3D_NegIdx< TEST_EXECSPACE >::test_3D_negidx( 128, 32, 8 ); + TestMDRange_4D_NegIdx< TEST_EXECSPACE >::test_4D_negidx( 128, 32, 8, 8 ); + TestMDRange_5D_NegIdx< TEST_EXECSPACE >::test_5D_negidx( 128, 32, 8, 8, 4 ); + TestMDRange_6D_NegIdx< TEST_EXECSPACE >::test_6D_negidx( 128, 32, 8, 8, 4, 2 ); +} //#endif } // namespace Test diff --git a/lib/kokkos/core/unit_test/TestMemoryPool.hpp b/lib/kokkos/core/unit_test/TestMemoryPool.hpp index 9f708390c2..8034ae4ca0 100644 --- a/lib/kokkos/core/unit_test/TestMemoryPool.hpp +++ b/lib/kokkos/core/unit_test/TestMemoryPool.hpp @@ -521,6 +521,101 @@ void test_memory_pool_corners( const bool print_statistics //---------------------------------------------------------------------------- 
//---------------------------------------------------------------------------- +template< class DeviceType , class Enable = void > +struct TestMemoryPoolHuge +{ + TestMemoryPoolHuge() {} + + enum : size_t { num_superblock = 0 }; + + using value_type = long ; + + KOKKOS_INLINE_FUNCTION + void operator()( int i , long & err ) const noexcept {} + + KOKKOS_INLINE_FUNCTION + void operator()( int i ) const noexcept {} +}; + +template< class DeviceType > +struct TestMemoryPoolHuge< DeviceType + , typename std::enable_if< + std::is_same< Kokkos::HostSpace + , typename DeviceType::memory_space > + ::value >::type + > +{ + typedef Kokkos::View< uintptr_t * , DeviceType > ptrs_type ; + typedef Kokkos::MemoryPool< DeviceType > pool_type ; + typedef typename DeviceType::memory_space memory_space ; + + pool_type pool ; + ptrs_type ptrs ; + + enum : size_t { min_block_size = 512 + , max_block_size = 1lu << 31 + , min_superblock_size = max_block_size + , num_superblock = 4 + , total_alloc_size = num_superblock * max_block_size }; + + TestMemoryPoolHuge() + : pool( memory_space() + , total_alloc_size + , min_block_size + , max_block_size + , min_superblock_size ) + , ptrs( "ptrs" , num_superblock ) + {} + + // Specify reduction argument value_type to + // avoid confusion with tag-dispatch. + + using value_type = long ; + + void operator()( int i , long & err ) const noexcept + { + if ( i < int(num_superblock) ) { + ptrs(i) = (uintptr_t) pool.allocate( max_block_size ); +#if 0 + printf("TestMemoryPoolHuge size(0x%lx) ptr(0x%lx)\n" + , max_block_size + , ptrs(i) ); +#endif + if ( ! 
ptrs(i) ) { + Kokkos::abort("TestMemoryPoolHuge"); + ++err ; + } + } + } + + void operator()( int i ) const noexcept + { + if ( i < int(num_superblock) ) { + pool.deallocate( (void*) ptrs(i) , max_block_size ); + ptrs(i) = 0 ; + } + } +}; + +template< class DeviceType > +void test_memory_pool_huge() +{ + typedef typename DeviceType::execution_space execution_space ; + typedef TestMemoryPoolHuge< DeviceType > functor_type ; + typedef Kokkos::RangePolicy< execution_space > policy_type ; + + functor_type f ; + policy_type policy( 0 , functor_type::num_superblock ); + + long err = 0 ; + + Kokkos::parallel_reduce( policy , f , err ); + Kokkos::parallel_for( policy , f ); +} + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + } // namespace TestMemoryPool namespace Test { @@ -531,6 +626,7 @@ TEST_F( TEST_CATEGORY, memory_pool ) TestMemoryPool::test_host_memory_pool_stats<>(); TestMemoryPool::test_memory_pool_v2< TEST_EXECSPACE >(false,false); TestMemoryPool::test_memory_pool_corners< TEST_EXECSPACE >(false,false); + TestMemoryPool::test_memory_pool_huge< TEST_EXECSPACE >(); } } diff --git a/lib/kokkos/core/unit_test/TestReduce.hpp b/lib/kokkos/core/unit_test/TestReduce.hpp index 86982e6a55..56d6259b5a 100644 --- a/lib/kokkos/core/unit_test/TestReduce.hpp +++ b/lib/kokkos/core/unit_test/TestReduce.hpp @@ -475,6 +475,8 @@ public: namespace Test { +struct ReducerTag {}; + template< class Scalar, class ExecSpace = Kokkos::DefaultExecutionSpace > struct TestReducers { struct SumFunctor { @@ -590,6 +592,118 @@ struct TestReducers { } }; + struct SumFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, Scalar & value ) const { + value += values( i ); + } + }; + + struct ProdFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void 
operator()( const ReducerTag, const int & i, Scalar & value ) const { + value *= values( i ); + } + }; + + struct MinFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, Scalar & value ) const { + if ( values( i ) < value ) value = values( i ); + } + }; + + struct MaxFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, Scalar & value ) const { + if ( values( i ) > value ) value = values( i ); + } + }; + + struct MinLocFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, typename Kokkos::Experimental::MinLoc< Scalar, int >::value_type & value ) const { + if ( values( i ) < value.val ) { + value.val = values( i ); + value.loc = i; + } + } + }; + + struct MaxLocFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, typename Kokkos::Experimental::MaxLoc< Scalar, int >::value_type & value ) const { + if ( values( i ) > value.val ) { + value.val = values( i ); + value.loc = i; + } + } + }; + + struct MinMaxLocFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, typename Kokkos::Experimental::MinMaxLoc< Scalar, int >::value_type & value ) const { + if ( values( i ) > value.max_val ) { + value.max_val = values( i ); + value.max_loc = i; + } + + if ( values( i ) < value.min_val ) { + value.min_val = values( i ); + value.min_loc = i; + } + } + }; + + struct BAndFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, Scalar & value ) const { + value = value & values( i ); + } + }; + + struct BOrFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > 
values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, Scalar & value ) const { + value = value | values( i ); + } + }; + + struct LAndFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, Scalar & value ) const { + value = value && values( i ); + } + }; + + struct LOrFunctorTag { + Kokkos::View< const Scalar*, ExecSpace > values; + + KOKKOS_INLINE_FUNCTION + void operator()( const ReducerTag, const int & i, Scalar & value ) const { + value = value || values( i ); + } + }; static void test_sum( int N ) { Kokkos::View< Scalar*, ExecSpace > values( "Values", N ); auto h_values = Kokkos::create_mirror_view( values ); @@ -603,13 +717,19 @@ struct TestReducers { SumFunctor f; f.values = values; + SumFunctorTag f_tag; + f_tag.values = values; Scalar init = 0; { Scalar sum_scalar = init; Kokkos::Experimental::Sum< Scalar > reducer_scalar( sum_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); - + ASSERT_EQ( sum_scalar, reference_sum ); + + sum_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( sum_scalar, reference_sum ); Scalar sum_scalar_view = reducer_scalar.reference(); @@ -643,13 +763,19 @@ struct TestReducers { ProdFunctor f; f.values = values; + ProdFunctorTag f_tag; + f_tag.values = values; Scalar init = 1; { Scalar prod_scalar = init; Kokkos::Experimental::Prod< Scalar > reducer_scalar( prod_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); - + ASSERT_EQ( prod_scalar, reference_prod ); + + prod_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( prod_scalar, reference_prod ); Scalar prod_scalar_view = reducer_scalar.reference(); @@ -684,13 +810,19 @@ struct TestReducers { MinFunctor f; 
f.values = values; + MinFunctorTag f_tag; + f_tag.values = values; Scalar init = std::numeric_limits< Scalar >::max(); { Scalar min_scalar = init; Kokkos::Experimental::Min< Scalar > reducer_scalar( min_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( min_scalar, reference_min ); + min_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( min_scalar, reference_min ); Scalar min_scalar_view = reducer_scalar.reference(); @@ -725,13 +857,19 @@ struct TestReducers { MaxFunctor f; f.values = values; + MaxFunctorTag f_tag; + f_tag.values = values; Scalar init = std::numeric_limits< Scalar >::min(); { Scalar max_scalar = init; Kokkos::Experimental::Max< Scalar > reducer_scalar( max_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( max_scalar, reference_max ); + + max_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( max_scalar, reference_max ); Scalar max_scalar_view = reducer_scalar.reference(); @@ -776,12 +914,19 @@ struct TestReducers { MinLocFunctor f; f.values = values; + MinLocFunctorTag f_tag; + f_tag.values = values; { value_type min_scalar; Kokkos::Experimental::MinLoc< Scalar, int > reducer_scalar( min_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( min_scalar.val, reference_min ); + ASSERT_EQ( min_scalar.loc, reference_loc ); + + min_scalar = value_type(); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( min_scalar.val, reference_min ); ASSERT_EQ( min_scalar.loc, 
reference_loc ); @@ -829,12 +974,19 @@ struct TestReducers { MaxLocFunctor f; f.values = values; + MaxLocFunctorTag f_tag; + f_tag.values = values; { value_type max_scalar; Kokkos::Experimental::MaxLoc< Scalar, int > reducer_scalar( max_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( max_scalar.val, reference_max ); + ASSERT_EQ( max_scalar.loc, reference_loc ); + + max_scalar = value_type(); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( max_scalar.val, reference_max ); ASSERT_EQ( max_scalar.loc, reference_loc ); @@ -898,12 +1050,35 @@ struct TestReducers { MinMaxLocFunctor f; f.values = values; + MinMaxLocFunctorTag f_tag; + f_tag.values = values; { value_type minmax_scalar; Kokkos::Experimental::MinMaxLoc< Scalar, int > reducer_scalar( minmax_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( minmax_scalar.min_val, reference_min ); + + for ( int i = 0; i < N; i++ ) { + if ( ( i == minmax_scalar.min_loc ) && ( h_values( i ) == reference_min ) ) { + reference_minloc = i; + } + } + + ASSERT_EQ( minmax_scalar.min_loc, reference_minloc ); + ASSERT_EQ( minmax_scalar.max_val, reference_max ); + + for ( int i = 0; i < N; i++ ) { + if ( ( i == minmax_scalar.max_loc ) && ( h_values( i ) == reference_max ) ) { + reference_maxloc = i; + } + } + + ASSERT_EQ( minmax_scalar.max_loc, reference_maxloc ); + + minmax_scalar = value_type(); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( minmax_scalar.min_val, reference_min ); for ( int i = 0; i < N; i++ ) { @@ -962,14 +1137,21 @@ struct TestReducers { BAndFunctor f; f.values = values; + 
BAndFunctorTag f_tag; + f_tag.values = values; Scalar init = Scalar() | ( ~Scalar() ); { Scalar band_scalar = init; Kokkos::Experimental::BAnd< Scalar > reducer_scalar( band_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); ASSERT_EQ( band_scalar, reference_band ); + + band_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); + ASSERT_EQ( band_scalar, reference_band ); + Scalar band_scalar_view = reducer_scalar.reference(); ASSERT_EQ( band_scalar_view, reference_band ); @@ -1002,13 +1184,19 @@ struct TestReducers { BOrFunctor f; f.values = values; + BOrFunctorTag f_tag; + f_tag.values = values; Scalar init = Scalar() & ( ~Scalar() ); { Scalar bor_scalar = init; Kokkos::Experimental::BOr< Scalar > reducer_scalar( bor_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( bor_scalar, reference_bor ); + + bor_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( bor_scalar, reference_bor ); Scalar bor_scalar_view = reducer_scalar.reference(); @@ -1042,13 +1230,19 @@ struct TestReducers { LAndFunctor f; f.values = values; + LAndFunctorTag f_tag; + f_tag.values = values; Scalar init = 1; { Scalar land_scalar = init; Kokkos::Experimental::LAnd< Scalar > reducer_scalar( land_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( land_scalar, reference_land ); + + land_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( 
land_scalar, reference_land ); Scalar land_scalar_view = reducer_scalar.reference(); @@ -1082,13 +1276,19 @@ struct TestReducers { LOrFunctor f; f.values = values; + LOrFunctorTag f_tag; + f_tag.values = values; Scalar init = 0; { Scalar lor_scalar = init; Kokkos::Experimental::LOr< Scalar > reducer_scalar( lor_scalar ); - Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace >( 0, N ), f, reducer_scalar ); + ASSERT_EQ( lor_scalar, reference_lor ); + + lor_scalar = init; + Kokkos::parallel_reduce( Kokkos::RangePolicy< ExecSpace , ReducerTag >( 0, N ), f_tag, reducer_scalar ); ASSERT_EQ( lor_scalar, reference_lor ); Scalar lor_scalar_view = reducer_scalar.reference(); diff --git a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp index a3f59a2b9e..2ababe6a49 100644 --- a/lib/kokkos/core/unit_test/TestTaskScheduler.hpp +++ b/lib/kokkos/core/unit_test/TestTaskScheduler.hpp @@ -141,13 +141,13 @@ struct TestFib enum { MinBlockSize = 64 }; enum { MaxBlockSize = 1024 }; - enum { SuperBlockSize = 1u << 12 }; + enum { SuperBlockSize = 4096 }; sched_type root_sched( memory_space() , MemoryCapacity , MinBlockSize - , MaxBlockSize - , SuperBlockSize ); + , std::min(size_t(MaxBlockSize),MemoryCapacity) + , std::min(size_t(SuperBlockSize),MemoryCapacity) ); future_type f = Kokkos::host_spawn( Kokkos::TaskSingle( root_sched ) , TestFib( root_sched, i ) ); @@ -205,11 +205,10 @@ struct TestTaskSpawn { { typedef typename sched_type::memory_space memory_space; - // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool. 
enum { MemoryCapacity = 16000 }; enum { MinBlockSize = 64 }; enum { MaxBlockSize = 1024 }; - enum { SuperBlockSize = 1u << 12 }; + enum { SuperBlockSize = 4096 }; sched_type sched( memory_space() , MemoryCapacity @@ -277,11 +276,10 @@ struct TestTaskDependence { { typedef typename sched_type::memory_space memory_space; - // enum { MemoryCapacity = 4000 }; // Triggers infinite loop in memory pool. enum { MemoryCapacity = 16000 }; enum { MinBlockSize = 64 }; enum { MaxBlockSize = 1024 }; - enum { SuperBlockSize = 1u << 12 }; + enum { SuperBlockSize = 4096 }; sched_type sched( memory_space() , MemoryCapacity @@ -471,13 +469,11 @@ struct TestTaskTeam { static void run( long n ) { - //const unsigned memory_capacity = 10000; // Causes memory pool infinite loop. - //const unsigned memory_capacity = 100000; // Fails with SPAN=1 for serial and OMP. const unsigned memory_capacity = 400000; enum { MinBlockSize = 64 }; enum { MaxBlockSize = 1024 }; - enum { SuperBlockSize = 1u << 12 }; + enum { SuperBlockSize = 4096 }; sched_type root_sched( typename sched_type::memory_space() , memory_capacity @@ -600,12 +596,11 @@ struct TestTaskTeamValue { static void run( long n ) { - //const unsigned memory_capacity = 10000; // Causes memory pool infinite loop. 
const unsigned memory_capacity = 100000; enum { MinBlockSize = 64 }; enum { MaxBlockSize = 1024 }; - enum { SuperBlockSize = 1u << 12 }; + enum { SuperBlockSize = 4096 }; sched_type root_sched( typename sched_type::memory_space() , memory_capacity @@ -655,7 +650,6 @@ TEST_F( TEST_CATEGORY, task_fib ) TEST_F( TEST_CATEGORY, task_depend ) { for ( int i = 0; i < 25; ++i ) { -printf("\nTest::task_depend %d\n",i); TestTaskScheduler::TestTaskDependence< TEST_EXECSPACE >::run( i ); } } diff --git a/lib/kokkos/core/unit_test/TestTeamVector.hpp b/lib/kokkos/core/unit_test/TestTeamVector.hpp index 7f4663d0f9..be048b19e4 100644 --- a/lib/kokkos/core/unit_test/TestTeamVector.hpp +++ b/lib/kokkos/core/unit_test/TestTeamVector.hpp @@ -46,6 +46,7 @@ #include #include #include +#include namespace TestTeamVector { @@ -840,7 +841,8 @@ public: const ScalarType solution = (ScalarType) nrows * (ScalarType) ncols; if ( int64_t(solution) != int64_t(result) ) { - printf( " TestTripleNestedReduce failed solution(%ld) != result(%ld), nrows(%d) ncols(%d) league_size(%d) team_size(%d)\n" + printf( " TestTripleNestedReduce failed solution(%" PRId64 ") != result(%" PRId64 ")," + " nrows(%" PRId32 ") ncols(%" PRId32 ") league_size(%" PRId32 ") team_size(%" PRId32 ")\n" , int64_t(solution) , int64_t(result) , int32_t(nrows) diff --git a/lib/kokkos/core/unit_test/TestViewAPI.hpp b/lib/kokkos/core/unit_test/TestViewAPI.hpp index 721ffd8378..0f5650d284 100644 --- a/lib/kokkos/core/unit_test/TestViewAPI.hpp +++ b/lib/kokkos/core/unit_test/TestViewAPI.hpp @@ -786,11 +786,40 @@ struct TestViewMirror ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() ); } + template< class MemoryTraits > + void static test_mirror_copy() { + Kokkos::View< double*, Layout, Kokkos::HostSpace > a_org( "A", 10 ); + a_org(5) = 42.0; + Kokkos::View< double*, Layout, Kokkos::HostSpace, MemoryTraits > a_h = a_org; + auto a_h2 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_h ); + auto a_d = 
Kokkos::create_mirror_view_and_copy( DeviceType(), a_h ); + auto a_h3 = Kokkos::create_mirror_view_and_copy( Kokkos::HostSpace(), a_d ); + + int equal_ptr_h_h2 = a_h.data() == a_h2.data() ? 1 : 0; + int equal_ptr_h_d = a_h.data() == a_d.data() ? 1 : 0; + int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; + int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; + + int is_same_memspace = std::is_same< Kokkos::HostSpace, typename DeviceType::memory_space >::value ? 1 : 0; + ASSERT_EQ( equal_ptr_h_h2, 1 ); + ASSERT_EQ( equal_ptr_h_d, is_same_memspace ); + ASSERT_EQ( equal_ptr_h2_d, is_same_memspace ); + ASSERT_EQ( equal_ptr_h3_d, is_same_memspace ); + + ASSERT_EQ( a_h.dimension_0(), a_h3.dimension_0() ); + ASSERT_EQ( a_h.dimension_0(), a_h2.dimension_0() ); + ASSERT_EQ( a_h.dimension_0(), a_d .dimension_0() ); + ASSERT_EQ( a_org(5), a_h3(5) ); + } + + void static testit() { test_mirror< Kokkos::MemoryTraits<0> >(); test_mirror< Kokkos::MemoryTraits >(); test_mirror_view< Kokkos::MemoryTraits<0> >(); test_mirror_view< Kokkos::MemoryTraits >(); + test_mirror_copy< Kokkos::MemoryTraits<0> >(); + test_mirror_copy< Kokkos::MemoryTraits >(); } }; @@ -1312,10 +1341,12 @@ return; } }; +#if !defined(KOKKOS_ENABLE_ROCM) TEST_F( TEST_CATEGORY, view_api ) { TestViewAPI< double, TEST_EXECSPACE >(); } +#endif TEST_F( TEST_CATEGORY, view_remap ) { diff --git a/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp b/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp index 219a4d1f20..d6db548665 100644 --- a/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp +++ b/lib/kokkos/core/unit_test/TestViewMapping_subview.hpp @@ -79,14 +79,18 @@ struct TestViewMappingSubview typedef Kokkos::View< int***[13][14], Kokkos::LayoutLeft, ExecSpace > DLT; typedef Kokkos::Subview< DLT, range, int, int, int, int > DLS1; + #if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND) static_assert( DLS1::rank == 1 && std::is_same< typename DLS1::array_layout, Kokkos::LayoutLeft >::value , "Subview 
layout error for rank 1 subview of left-most range of LayoutLeft" ); + #endif typedef Kokkos::View< int***[13][14], Kokkos::LayoutRight, ExecSpace > DRT; typedef Kokkos::Subview< DRT, int, int, int, int, range > DRS1; + #if !defined(KOKKOS_IMPL_CUDA_VERSION_9_WORKAROUND) static_assert( DRS1::rank == 1 && std::is_same< typename DRS1::array_layout, Kokkos::LayoutRight >::value , "Subview layout error for rank 1 subview of right-most range of LayoutRight" ); + #endif AT Aa; AS Ab; diff --git a/lib/kokkos/core/unit_test/TestViewSubview.hpp b/lib/kokkos/core/unit_test/TestViewSubview.hpp index 106323492a..e4ea090e80 100644 --- a/lib/kokkos/core/unit_test/TestViewSubview.hpp +++ b/lib/kokkos/core/unit_test/TestViewSubview.hpp @@ -1294,5 +1294,41 @@ void test_layoutright_to_layoutright() { } } +//---------------------------------------------------------------------------- + +template< class Space > +struct TestUnmanagedSubviewReset +{ + Kokkos::View a ; + + KOKKOS_INLINE_FUNCTION + void operator()( int ) const noexcept + { + auto sub_a = Kokkos::subview(a,0,Kokkos::ALL,Kokkos::ALL,Kokkos::ALL); + + for ( int i = 0 ; i < int(a.dimension(0)) ; ++i ) { + sub_a.assign_data( & a(i,0,0,0) ); + if ( & sub_a(1,1,1) != & a(i,1,1,1) ) { + Kokkos::abort("TestUnmanagedSubviewReset"); + } + } + } + + TestUnmanagedSubviewReset() + : a( Kokkos::view_alloc() , 20 , 10 , 5 , 2 ) + {} +}; + +template< class Space > +void test_unmanaged_subview_reset() +{ + Kokkos::parallel_for + ( Kokkos::RangePolicy< typename Space::execution_space >(0,1) + , TestUnmanagedSubviewReset() + ); +} + } // namespace TestViewSubview + #endif + diff --git a/lib/kokkos/core/unit_test/TestWorkGraph.hpp b/lib/kokkos/core/unit_test/TestWorkGraph.hpp index 70cf6b47c0..37483a5460 100644 --- a/lib/kokkos/core/unit_test/TestWorkGraph.hpp +++ b/lib/kokkos/core/unit_test/TestWorkGraph.hpp @@ -67,7 +67,7 @@ template< class ExecSpace > struct TestWorkGraph { using MemorySpace = typename ExecSpace::memory_space; - using 
Policy = Kokkos::Experimental::WorkGraphPolicy; + using Policy = Kokkos::WorkGraphPolicy; using Graph = typename Policy::graph_type; using RowMap = typename Graph::row_map_type; using Entries = typename Graph::entries_type; @@ -117,6 +117,7 @@ struct TestWorkGraph { m_graph.row_map = RowMap("row_map", hg.size() + 1); // row map always has one more m_graph.entries = Entries("entries", hg.size() - 1); // all but the first have a parent m_values = Values("values", hg.size()); + //printf("%zu work items\n", hg.size()); auto h_row_map = Kokkos::create_mirror_view(m_graph.row_map); auto h_entries = Kokkos::create_mirror_view(m_graph.entries); auto h_values = Kokkos::create_mirror_view(m_values); @@ -156,17 +157,15 @@ struct TestWorkGraph { } // anonymous namespace -TEST_F( TEST_CATEGORY, DISABLED_workgraph_fib ) +TEST_F( TEST_CATEGORY, workgraph_fib ) { - #ifdef KOKKOS_IMPL_CUDA_CLANG_WORKAROUND - int limit = 15; - #else int limit = 27; - #endif for ( int i = 0; i < limit; ++i) { TestWorkGraph< TEST_EXECSPACE > f(i); f.test_for(); } + //TestWorkGraph< TEST_EXECSPACE > f(2); + //f.test_for(); } } // namespace Test diff --git a/lib/kokkos/core/unit_test/UnitTestConfig.make b/lib/kokkos/core/unit_test/UnitTestConfig.make new file mode 100644 index 0000000000..97f4af5a8b --- /dev/null +++ b/lib/kokkos/core/unit_test/UnitTestConfig.make @@ -0,0 +1,52 @@ +KOKKOS_PATH = ../.. 
+ +# See $(KOKKOS_PATH)/Makefile.kokkos and $(KOKKOS_PATH)/generate_makefile.bash +KOKKOS_ARCH_OPTIONS="None AMDAVX ARMv80 ARMv81 ARMv8-ThunderX \ + BGQ Power7 Power8 Power9 \ + WSM SNB HSW BDW SKX KNC KNL \ + Kepler Kepler30 Kepler32 Kepler35 Kepler37 \ + Maxwell Maxwell50 Maxwell52 Maxwell53 Pascal60 Pascal61" +#KOKKOS_ARCH_OPTIONS="AMDAVX" + +KOKKOS_DEVICE_OPTIONS="Cuda ROCm OpenMP Pthread Serial Qthreads" +#KOKKOS_DEVICE_OPTIONS="Cuda" + +# Configure paths to enable environment query in Makefile.kokkos to work +ROCM_HCC_PATH="config" +CXX="./config/cxx" +ipath=env CXX=$(CXX) env PATH=./config:$$PATH env ROCM_HCC_PATH=$(ROCM_HCC_PATH) + +# Defined in core/src/Makefile -- this should be consistent +KOKKOS_MAKEFILE=Makefile.kokkos +KOKKOS_CMAKEFILE=kokkos_generated_settings.cmake + +# Defined in Makefile.kokkos -- this should be consistent +KOKKOS_INTERNAL_CONFIG_TMP=KokkosCore_config.tmp +KOKKOS_CONFIG_HEADER=KokkosCore_config.h + +d='\#' + +# diff => 0 is no difference. if => 0 is false +testmake=if test "`testmake.sh $1 $2 $3`" = 'Passed'; then echo OK $d $1; else echo not OK $d $1; fi +testconf=if test "`diffconfig.sh $1`" = 'Passed'; then echo OK $d $1; else echo not OK $d $1; fi + +# testing tmp and cmakefile files is unnecessary here +test: + @for karch in "$(KOKKOS_ARCH_OPTIONS)"; do \ + for device in "$(KOKKOS_DEVICE_OPTIONS)"; do \ + $(ipath) KOKKOS_DEVICES=$$device KOKKOS_ARCH=$$karch make -e -f ../src/Makefile build-makefile-cmake-kokkos; \ + rm -f $(KOKKOS_INTERNAL_CONFIG_TMP) $(KOKKOS_CMAKEFILE); \ + prfx="$$karch"_"$$device"_; \ + newmake="$$prfx"$(KOKKOS_MAKEFILE); \ + newconf="$$prfx"$(KOKKOS_CONFIG_HEADER); \ + mv $(KOKKOS_MAKEFILE) config/tmpstore/$$newmake; \ + mv $(KOKKOS_CONFIG_HEADER) config/tmpstore/$$newconf; \ + $(call testmake,$$newmake,$$karch,$$device); \ + $(call testconf,$$newconf); \ + done; \ + done + +test-cmake: + @cd config/cmaketest; \ + cmake . 
; \ + make test diff --git a/lib/kokkos/core/unit_test/config/bin/hcc-config b/lib/kokkos/core/unit_test/config/bin/hcc-config new file mode 100755 index 0000000000..fc09138bcc --- /dev/null +++ b/lib/kokkos/core/unit_test/config/bin/hcc-config @@ -0,0 +1,2 @@ +#!/bin/sh +echo "--foo --bar" diff --git a/lib/kokkos/core/unit_test/config/clang b/lib/kokkos/core/unit_test/config/clang new file mode 100755 index 0000000000..34c6919410 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/clang @@ -0,0 +1,5 @@ +#!/bin/sh +echo="Apple LLVM version 8.1.0 (clang-802.0.42)" +echo="Target: x86_64-apple-darwin16.7.0" +echo="Thread model: posix" +echo="InstalledDir: /Applications/Xcode.app/Contents/Developer/Toolchains/XcodeDefault.xctoolchain/usr/bin" diff --git a/lib/kokkos/core/unit_test/config/cmaketest/CMakeLists.txt b/lib/kokkos/core/unit_test/config/cmaketest/CMakeLists.txt new file mode 100644 index 0000000000..54a4c4a74a --- /dev/null +++ b/lib/kokkos/core/unit_test/config/cmaketest/CMakeLists.txt @@ -0,0 +1,80 @@ +cmake_minimum_required(VERSION 3.1 FATAL_ERROR) +project(Kokkos CXX) + +enable_testing() + +# Initialization +get_filename_component(KOKKOS_TESTDIR ${CMAKE_SOURCE_DIR}/../.. REALPATH) +get_filename_component(KOKKOS_SRCDIR ${CMAKE_SOURCE_DIR}/../../../.. 
REALPATH) +set(KOKKOS_SRC_PATH ${KOKKOS_SRCDIR}) +set(KOKKOS_PATH ${KOKKOS_SRC_PATH}) + +set(CXX ${KOKKOS_TESTDIR}/config/cxx) + +# Defined in core/src/Makefile -- this should be consistent +set(KOKKOS_MAKEFILE Makefile.kokkos) +set(KOKKOS_CMAKEFILE kokkos_generated_settings.cmake) + +# Defined in Makefile.kokkos -- this should be consistent +set(KOKKOS_INTERNAL_CONFIG_TMP KokkosCore_config.tmp) +set(KOKKOS_CONFIG_HEADER KokkosCore_config.h) + +set(KOKKOS_CMAKE_VERBOSE False) +include(${KOKKOS_SRCDIR}/cmake/kokkos_options.cmake) +foreach(KOKKOS_DEV ${KOKKOS_DEVICES_LIST}) +# Do some initialization: Want to turn everything off for testing + string(TOUPPER ${KOKKOS_DEV} KOKKOS_DEVUC) + set(KOKKOS_ENABLE_${KOKKOS_DEVUC} OFF) +endforeach() + + +#TEST set(KOKKOS_HOST_ARCH_LIST ARMv80) +#TEST set(KOKKOS_DEVICES_LIST Cuda) +#set(KOKKOS_HOST_ARCH_LIST AMDAVX) +#set(KOKKOS_DEVICES_LIST Cuda) + +foreach(KOKKOS_HOST_ARCH ${KOKKOS_HOST_ARCH_LIST}) + foreach(KOKKOS_DEV ${KOKKOS_DEVICES_LIST}) + string(TOUPPER ${KOKKOS_DEV} KOKKOS_DEVUC) + set(KOKKOS_ENABLE_${KOKKOS_DEVUC} On) + + set(KOKKOS_CMAKE_VERBOSE True) + include(${KOKKOS_SRCDIR}/cmake/kokkos_options.cmake) + set(KOKKOS_SETTINGS ${KOKKOS_SETTINGS} ROCM_HCC_PATH=${KOKKOS_TESTDIR}/config) + + #message(STATUS "${KOKKOS_SETTINGS} make -f ${KOKKOS_SRCDIR}/core/src/Makefile build-makefile-cmake-kokkos") + execute_process( + COMMAND ${KOKKOS_SETTINGS} make -f ${KOKKOS_SRCDIR}/core/src/Makefile build-makefile-cmake-kokkos + WORKING_DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}" + OUTPUT_FILE ${CMAKE_BINARY_DIR}/core_src_make.out + RESULT_VARIABLE res + ) + #message(STATUS "RESULT ${res}") + + file(REMOVE ${KOKKOS_INTERNAL_CONFIG_TMP} ${KOKKOS_MAKEFILE}) + set(PREFIX "${KOKKOS_HOST_ARCH}_${KOKKOS_DEV}_") + set(NEWCMAKE ${PREFIX}${KOKKOS_CMAKEFILE}) + set(NEWCONFH ${PREFIX}${KOKKOS_CONFIG_HEADER}) + file(RENAME ${KOKKOS_CMAKEFILE} ${NEWCMAKE}) + file(RENAME ${KOKKOS_CONFIG_HEADER} ${NEWCONFH}) + + add_test(NAME ${NEWCMAKE}-test + COMMAND 
${KOKKOS_TESTDIR}/testmake.sh ${NEWCMAKE} ${KOKKOS_HOST_ARCH} ${KOKKOS_DEV} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + set_tests_properties(${NEWCMAKE}-test + PROPERTIES PASS_REGULAR_EXPRESSION Passed + TIMEOUT 15 + ) + add_test(NAME ${NEWCONFH}-test + COMMAND ${KOKKOS_TESTDIR}/diffconfig.sh ${NEWCONFH} + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} + ) + set_tests_properties(${NEWCONFH}-test + PROPERTIES PASS_REGULAR_EXPRESSION Passed + TIMEOUT 15 + ) + set(KOKKOS_ENABLE_${KOKKOS_DEVUC} Off) + + endforeach() +endforeach() diff --git a/lib/kokkos/core/unit_test/config/cxx b/lib/kokkos/core/unit_test/config/cxx new file mode 100755 index 0000000000..f25d7714a5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/cxx @@ -0,0 +1,5 @@ +#!/bin/sh +echo "g++ (GCC) 6.3.1 20161221 (Red Hat 6.3.1-1)" +echo "Copyright (C) 2016 Free Software Foundation, Inc." +echo "This is free software; see the source for copying conditions. There is NO" +echo "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." diff --git a/lib/kokkos/core/unit_test/config/mpic++ b/lib/kokkos/core/unit_test/config/mpic++ new file mode 100755 index 0000000000..f25d7714a5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/mpic++ @@ -0,0 +1,5 @@ +#!/bin/sh +echo "g++ (GCC) 6.3.1 20161221 (Red Hat 6.3.1-1)" +echo "Copyright (C) 2016 Free Software Foundation, Inc." +echo "This is free software; see the source for copying conditions. There is NO" +echo "warranty; not even for MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE." 
diff --git a/lib/kokkos/core/unit_test/config/nvcc b/lib/kokkos/core/unit_test/config/nvcc new file mode 100755 index 0000000000..b5bcbf234c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/nvcc @@ -0,0 +1,5 @@ +#!/bin/sh +echo "nvcc: NVIDIA (R) Cuda compiler driver" +echo "Copyright (c) 2005-2016 NVIDIA Corporation" +echo "Built on Tue_Jan_10_13:22:03_CST_2017" +echo "Cuda compilation tools, release 8.0, V8.0.61" diff --git a/lib/kokkos/core/unit_test/config/results/AMDAVX_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/AMDAVX_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..1a737a3b2f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/AMDAVX_Cuda_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:09 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/AMDAVX_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/AMDAVX_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..7a704e4185 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/AMDAVX_OpenMP_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:10 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/AMDAVX_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/AMDAVX_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..c478a5c252 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/AMDAVX_Pthread_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:10 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/AMDAVX_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/AMDAVX_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..fb5d214630 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/AMDAVX_Qthreads_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:11 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/AMDAVX_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/AMDAVX_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..7b7e2b8153 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/AMDAVX_ROCm_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:09 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/AMDAVX_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/AMDAVX_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..9930bacc47 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/AMDAVX_Serial_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:11 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..7f172c00e4 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:17 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 +#define KOKKOS_ARCH_ARMV8_THUNDERX 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..d25b832ca2 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_OpenMP_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:18 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 +#define KOKKOS_ARCH_ARMV8_THUNDERX 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..cd3a603092 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Pthread_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:19 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 +#define KOKKOS_ARCH_ARMV8_THUNDERX 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..3865bc4a9a --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Qthreads_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:20 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 +#define KOKKOS_ARCH_ARMV8_THUNDERX 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..86b9f84585 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_ROCm_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:18 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 +#define KOKKOS_ARCH_ARMV8_THUNDERX 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..75ada8c01f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv8-ThunderX_Serial_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:19 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 +#define KOKKOS_ARCH_ARMV8_THUNDERX 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv80_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv80_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..796c0aab65 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv80_Cuda_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:12 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv80_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv80_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..dcf7ff7ea2 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv80_OpenMP_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:13 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv80_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv80_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..298966b6d4 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv80_Pthread_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:14 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv80_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv80_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..7259a9e964 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv80_Qthreads_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:14 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv80_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv80_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..c2b4f146cb --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv80_ROCm_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:12 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv80_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv80_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..fe5fe66445 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv80_Serial_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:14 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV80 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv81_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv81_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..3d02142438 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv81_Cuda_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:15 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV81 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv81_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv81_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..aa194c77be --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv81_OpenMP_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:16 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV81 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv81_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv81_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..6d2dbeeef4 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv81_Pthread_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:16 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV81 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv81_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv81_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..e9fc71ad9b --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv81_Qthreads_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:17 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV81 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv81_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv81_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..28a56596b4 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv81_ROCm_KokkosCore_config.h @@ -0,0 +1,18 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:15 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV81 1 diff --git a/lib/kokkos/core/unit_test/config/results/ARMv81_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/ARMv81_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..1d29fd1390 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/ARMv81_Serial_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:16 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_ARMV81 1 diff --git a/lib/kokkos/core/unit_test/config/results/BDW_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BDW_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..ce2582b23f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BDW_Cuda_KokkosCore_config.h @@ -0,0 +1,24 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:37 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/BDW_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BDW_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..118d1b225f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BDW_OpenMP_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:38 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/BDW_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BDW_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..6d0215baf6 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BDW_Pthread_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:38 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/BDW_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BDW_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..e879e7e1fe --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BDW_Qthreads_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:39 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/BDW_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BDW_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..3f86d055af --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BDW_ROCm_KokkosCore_config.h @@ -0,0 +1,24 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:37 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/BDW_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BDW_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..fba671ab1a --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BDW_Serial_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:39 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/BGQ_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BGQ_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..93c74d41e2 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BGQ_Cuda_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:43 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/BGQ_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BGQ_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..533da16028 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BGQ_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:43 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/BGQ_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BGQ_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..9524c94f2b --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BGQ_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:44 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/BGQ_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BGQ_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..60c7ddcdb5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BGQ_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:44 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/BGQ_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BGQ_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..f5bc1f54a9 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BGQ_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:44 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/BGQ_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/BGQ_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..8372c00699 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/BGQ_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:44 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/HSW_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/HSW_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..7bbe9fa84c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/HSW_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:34 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/HSW_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/HSW_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..17f75872f8 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/HSW_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:35 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/HSW_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/HSW_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..5df1be17ad --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/HSW_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:35 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/HSW_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/HSW_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..253dc35bdf --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/HSW_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:36 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/HSW_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/HSW_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..8e04801b86 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/HSW_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:35 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/HSW_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/HSW_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..99f76aff0b --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/HSW_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:36 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX2 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNC_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNC_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..bdc270fd0d --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNC_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:42 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_KNC +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KNC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNC_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNC_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..f9b79f552d --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNC_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:43 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_KNC +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KNC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNC_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNC_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..15d9d01a0a --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNC_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:44 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_KNC +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KNC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNC_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNC_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..5f95a83c27 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNC_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:45 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_KNC +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KNC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNC_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNC_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..5991d3065f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNC_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:43 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_KNC +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KNC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNC_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNC_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..3a8ddecf14 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNC_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:44 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_KNC +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KNC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNL_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNL_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..bd7e2ca330 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNL_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:45 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512MIC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNL_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNL_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..0f567f241c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNL_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:46 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512MIC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNL_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNL_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..1cf3f0997a --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNL_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:47 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512MIC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNL_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNL_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..6d179d82f8 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNL_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:48 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512MIC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNL_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNL_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..ae2938e34a --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNL_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:46 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512MIC 1 diff --git a/lib/kokkos/core/unit_test/config/results/KNL_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/KNL_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..21f6e7e434 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/KNL_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:47 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512MIC 1 diff --git a/lib/kokkos/core/unit_test/config/results/Kepler30_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler30_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..78e9335e24 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler30_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:48 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KEPLER 1 +#define KOKKOS_ARCH_KEPLER30 1 diff --git a/lib/kokkos/core/unit_test/config/results/Kepler30_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler30_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..769d9c8789 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler30_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:49 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler30_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler30_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..2cc728a5e3 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler30_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:49 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler30_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler30_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..410ba5ea15 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler30_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:50 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler30_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler30_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..34867aa91e --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler30_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:48 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler30_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler30_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..54943b244f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler30_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:50 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler32_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler32_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..c7e23d503c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler32_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:50 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KEPLER 1 +#define KOKKOS_ARCH_KEPLER32 1 diff --git a/lib/kokkos/core/unit_test/config/results/Kepler32_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler32_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..fcfbf97ef2 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler32_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:51 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler32_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler32_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..5cea100aa4 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler32_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:52 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler32_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler32_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..f42d0cc5f2 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler32_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:53 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler32_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler32_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..0ae47b6976 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler32_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:51 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler32_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler32_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..0d20b1dc81 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler32_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:52 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler35_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler35_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..f7935927c3 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler35_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:53 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KEPLER 1 +#define KOKKOS_ARCH_KEPLER35 1 diff --git a/lib/kokkos/core/unit_test/config/results/Kepler35_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler35_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..02777df40a --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler35_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:54 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler35_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler35_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..f51f00ce95 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler35_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:55 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler35_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler35_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..429f5e9e28 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler35_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:55 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler35_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler35_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..111bb09340 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler35_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:54 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler35_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler35_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..da61dabb58 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler35_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:55 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler37_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler37_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..c70ce2e04c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler37_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:56 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KEPLER 1 +#define KOKKOS_ARCH_KEPLER37 1 diff --git a/lib/kokkos/core/unit_test/config/results/Kepler37_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler37_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..d8c6c74832 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler37_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:57 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler37_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler37_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..b832ef36e5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler37_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:58 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler37_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler37_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..2b8a7f8183 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler37_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:59 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler37_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler37_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..6a661f8842 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler37_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:57 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler37_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler37_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..469f3d96a7 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler37_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:58 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..1ccf1bef54 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:50 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_KEPLER 1 +#define KOKKOS_ARCH_KEPLER35 1 diff --git a/lib/kokkos/core/unit_test/config/results/Kepler_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..9d87c958a2 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:51 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..263870be9f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:51 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..021d18c002 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:51 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..2826fdfb88 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:52 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Kepler_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Kepler_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..69097e034d --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Kepler_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:52 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell50_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell50_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..fac64e9e98 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell50_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:59 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_MAXWELL 1 +#define KOKKOS_ARCH_MAXWELL50 1 diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell50_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell50_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..3f5b3eea13 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell50_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:00 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell50_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell50_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..b249c88be5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell50_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:01 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell50_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell50_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..be1353365c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell50_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:02 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell50_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell50_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..ce9f67d5be --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell50_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:00 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell50_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell50_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..f8c6be139e --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell50_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:02 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell52_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell52_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..ce28f3e4b7 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell52_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:03 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_MAXWELL 1 +#define KOKKOS_ARCH_MAXWELL52 1 diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell52_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell52_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..35635063a5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell52_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:04 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell52_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell52_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..140740f81f --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell52_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:04 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell52_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell52_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..58a043c6a3 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell52_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:05 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell52_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell52_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..06ff6935ca --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell52_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:03 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell52_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell52_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..eac120d061 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell52_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:05 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell53_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell53_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..ad8344a099 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell53_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:06 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_MAXWELL 1 +#define KOKKOS_ARCH_MAXWELL53 1 diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell53_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell53_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..ab1e801267 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell53_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:06 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell53_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell53_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..0b1e3bf311 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell53_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:07 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell53_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell53_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..96fdbef3dc --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell53_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:08 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell53_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell53_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..82414cf358 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell53_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:06 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell53_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell53_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..b10b80b3bc --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell53_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:07 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..d81a715007 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:20:00 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_MAXWELL 1 +#define KOKKOS_ARCH_MAXWELL50 1 diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..98e93c7b28 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:20:00 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..47a7ccb7a5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:20:00 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..a7f1fd3803 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:20:01 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..c438f4f7d5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:20:01 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Maxwell_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Maxwell_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..d66c569084 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Maxwell_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:20:01 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/None_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/None_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..6bf2755fd0 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/None_Cuda_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:22 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/None_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/None_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..4dd2eed180 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/None_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:23 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/None_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/None_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..1bdd29b6a5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/None_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:23 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/None_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/None_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..6bd8addd97 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/None_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:23 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/None_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/None_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..74b0d7335c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/None_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:24 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/None_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/None_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..a9d0b264b8 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/None_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Tue Sep 26 15:19:23 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal60_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal60_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..8fe1aa698d --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal60_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:08 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_PASCAL 1 +#define KOKKOS_ARCH_PASCAL60 1 diff --git a/lib/kokkos/core/unit_test/config/results/Pascal60_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal60_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..93173f4e11 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal60_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:09 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal60_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal60_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..a05d5729e0 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal60_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:09 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal60_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal60_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..c5a2d1d707 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal60_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:10 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal60_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal60_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..9c04befef5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal60_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:09 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal60_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal60_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..c6038c2965 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal60_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:10 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal61_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal61_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..0de37df960 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal61_Cuda_KokkosCore_config.h @@ -0,0 +1,19 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:11 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_PASCAL 1 +#define KOKKOS_ARCH_PASCAL61 1 diff --git a/lib/kokkos/core/unit_test/config/results/Pascal61_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal61_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..2c392cc0df --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal61_OpenMP_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:12 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal61_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal61_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..f704aa9c81 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal61_Pthread_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:12 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal61_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal61_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..958aac11da --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal61_Qthreads_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:13 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal61_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal61_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..4a4d8cc683 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal61_ROCm_KokkosCore_config.h @@ -0,0 +1,17 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:11 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Pascal61_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Pascal61_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..6fb2cf9e9d --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Pascal61_Serial_KokkosCore_config.h @@ -0,0 +1,16 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:23:12 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ diff --git a/lib/kokkos/core/unit_test/config/results/Power7_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power7_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..a78e1ffc8d --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power7_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:20 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCBE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER7 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power7_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power7_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..bd856b80a5 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power7_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:21 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCBE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER7 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power7_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power7_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..8b3ac2aff9 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power7_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:21 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCBE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER7 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power7_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power7_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..dffa8a3f58 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power7_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:22 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCBE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER7 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power7_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power7_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..e16cfb37bd --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power7_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:20 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCBE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER7 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power7_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power7_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..6831f3ce25 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power7_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:22 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCBE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER7 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power8_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power8_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..1ab0b04c6c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power8_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:23 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER8 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power8_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power8_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..54750405ca --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power8_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:24 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER8 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power8_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power8_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..5d71338d23 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power8_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:24 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER8 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power8_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power8_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..9da90f4f7e --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power8_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:25 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER8 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power8_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power8_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..f3fd70b0cf --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power8_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:24 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER8 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power8_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power8_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..7c0ecc22d3 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power8_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:25 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER8 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power9_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power9_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..47d518f407 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power9_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:26 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER9 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power9_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power9_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..106bf33e44 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power9_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:27 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER9 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power9_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power9_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..108e5eba47 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power9_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:27 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER9 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power9_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power9_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..5c5be2ed3c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power9_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:28 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER9 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power9_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power9_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..8b6a391d95 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power9_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:26 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER9 1 diff --git a/lib/kokkos/core/unit_test/config/results/Power9_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/Power9_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..6f7aefe62e --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/Power9_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:27 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_POWERPCLE +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_POWER9 1 diff --git a/lib/kokkos/core/unit_test/config/results/SKX_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SKX_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..8f4380d992 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SKX_Cuda_KokkosCore_config.h @@ -0,0 +1,24 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:40 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512XEON 1 diff --git a/lib/kokkos/core/unit_test/config/results/SKX_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SKX_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..0a907a2ae1 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SKX_OpenMP_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:40 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include 
Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512XEON 1 diff --git a/lib/kokkos/core/unit_test/config/results/SKX_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SKX_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..50a95223c9 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SKX_Pthread_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:41 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512XEON 1 diff --git a/lib/kokkos/core/unit_test/config/results/SKX_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SKX_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..2e4b1d61ef --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SKX_Qthreads_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:42 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512XEON 1 diff --git a/lib/kokkos/core/unit_test/config/results/SKX_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SKX_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..12293350a1 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SKX_ROCm_KokkosCore_config.h @@ -0,0 +1,24 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:40 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512XEON 1 diff --git a/lib/kokkos/core/unit_test/config/results/SKX_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SKX_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..4ea457aacf --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SKX_Serial_KokkosCore_config.h @@ -0,0 +1,23 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:41 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_ENABLE_TM +#endif +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX512XEON 1 diff --git a/lib/kokkos/core/unit_test/config/results/SNB_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SNB_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..34c9537834 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SNB_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:31 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/SNB_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SNB_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..f7ed4d720c --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SNB_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:32 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/SNB_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SNB_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..126c29ba77 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SNB_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:33 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/SNB_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SNB_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..2f0216f9c4 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SNB_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:34 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/SNB_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SNB_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..5c68008bea --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SNB_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:32 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/SNB_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/SNB_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..0278d0d079 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/SNB_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:33 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_AVX 1 diff --git a/lib/kokkos/core/unit_test/config/results/WSM_Cuda_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/WSM_Cuda_KokkosCore_config.h new file mode 100644 index 0000000000..97389bb1bf --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/WSM_Cuda_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:28 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_CUDA 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_SSE42 1 diff --git a/lib/kokkos/core/unit_test/config/results/WSM_OpenMP_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/WSM_OpenMP_KokkosCore_config.h new file mode 100644 index 0000000000..dd5648f0c8 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/WSM_OpenMP_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:29 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_OPENMP 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_SSE42 1 diff --git a/lib/kokkos/core/unit_test/config/results/WSM_Pthread_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/WSM_Pthread_KokkosCore_config.h new file mode 100644 index 0000000000..c8a7adbd89 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/WSM_Pthread_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:30 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_PTHREAD 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_SSE42 1 diff --git a/lib/kokkos/core/unit_test/config/results/WSM_Qthreads_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/WSM_Qthreads_KokkosCore_config.h new file mode 100644 index 0000000000..d4a78790e3 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/WSM_Qthreads_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:31 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_QTHREADS 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_SSE42 1 diff --git a/lib/kokkos/core/unit_test/config/results/WSM_ROCm_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/WSM_ROCm_KokkosCore_config.h new file mode 100644 index 0000000000..712b5686f0 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/WSM_ROCm_KokkosCore_config.h @@ -0,0 +1,21 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:29 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." 
+#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_ENABLE_ROCM 1 +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_SSE42 1 diff --git a/lib/kokkos/core/unit_test/config/results/WSM_Serial_KokkosCore_config.h b/lib/kokkos/core/unit_test/config/results/WSM_Serial_KokkosCore_config.h new file mode 100644 index 0000000000..5bac7c2660 --- /dev/null +++ b/lib/kokkos/core/unit_test/config/results/WSM_Serial_KokkosCore_config.h @@ -0,0 +1,20 @@ +/* --------------------------------------------- +Makefile constructed configuration: +Fri Sep 22 17:22:30 MDT 2017 +----------------------------------------------*/ +#if !defined(KOKKOS_MACROS_HPP) || defined(KOKKOS_CORE_CONFIG_H) +#error "Do not include KokkosCore_config.h directly; include Kokkos_Macros.hpp instead." +#else +#define KOKKOS_CORE_CONFIG_H +#endif +/* Execution Spaces */ +#define KOKKOS_HAVE_SERIAL 1 +#ifndef __CUDA_ARCH__ +#define KOKKOS_USE_ISA_X86_64 +#endif +/* General Settings */ +#define KOKKOS_HAVE_CXX11 1 +#define KOKKOS_ENABLE_PROFILING +/* Optimization Settings */ +/* Cuda Settings */ +#define KOKKOS_ARCH_SSE42 1 diff --git a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c13.cpp similarity index 63% rename from lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp rename to lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c13.cpp index 57f8633bca..2c38bb0d91 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Rendezvous.hpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c13.cpp @@ -41,47 +41,14 @@ //@HEADER */ -#ifndef KOKKOS_IMPL_RENDEZVOUS_HPP -#define KOKKOS_IMPL_RENDEZVOUS_HPP +#include +#include -#include +namespace Test { -namespace Kokkos { namespace Impl { - -inline -constexpr int rendezvous_buffer_size( int max_members ) 
noexcept +TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset ) { - return (((max_members + 7) / 8) * 4) + 4 + 4; + TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >(); } -/** \brief Thread pool rendezvous - * - * Rendezvous pattern: - * if ( rendezvous(root) ) { - * ... only root thread here while all others wait ... - * rendezvous_release(); - * } - * else { - * ... all other threads release here ... - * } - * - * Requires: buffer[ rendezvous_buffer_size( max_threads ) ]; - * - * When slow != 0 the expectation is thread arrival will be - * slow so the threads that arrive early should quickly yield - * their core to the runtime thus possibly allowing the late - * arriving threads to have more resources - * (e.g., power and clock frequency). - */ -int rendezvous( volatile int64_t * const buffer - , int const size - , int const rank - , int const slow = 0 ) noexcept ; - -void rendezvous_release( volatile int64_t * const buffer ) noexcept ; - - -}} // namespace Kokkos::Impl - -#endif // KOKKOS_IMPL_RENDEZVOUS_HPP - +} // namespace Test diff --git a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp index a2158f06c7..1210307c76 100644 --- a/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp +++ b/lib/kokkos/core/unit_test/cuda/TestCuda_SubView_c_all.cpp @@ -10,3 +10,4 @@ #include #include #include +#include diff --git a/lib/kokkos/core/unit_test/diffconfig.sh b/lib/kokkos/core/unit_test/diffconfig.sh new file mode 100755 index 0000000000..0c8836ff83 --- /dev/null +++ b/lib/kokkos/core/unit_test/diffconfig.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# CMake and Make tests run in separate directories +# The mapping of ARCH to #define is very complicated +# so diff is used instead of grepping +if test "`basename $PWD`" = "cmaketest"; then + outfile=$1 + resfile=../results/$1 +else + outfile=config/tmpstore/$1 + resfile=config/results/$1 +fi + +diff=`diff $outfile $resfile 2>&1 | grep -e 
define -e "such file"` +if test -z "$diff"; then + echo Passed +else + echo Failed: $diff +fi diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c13.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c13.cpp new file mode 100644 index 0000000000..937bf69f19 --- /dev/null +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c13.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +namespace Test { + +TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset ) +{ + TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >(); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp index 399c6e92e4..5bf3626de9 100644 --- a/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp +++ b/lib/kokkos/core/unit_test/openmp/TestOpenMP_SubView_c_all.cpp @@ -10,3 +10,4 @@ #include #include #include +#include diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c13.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c13.cpp new file mode 100644 index 0000000000..54897c922d --- /dev/null +++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c13.cpp @@ -0,0 +1,55 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +namespace Test { + +TEST_F( qthreads, view_test_unmanaged_subview_reset ) +{ +#if 0 + TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >(); +#endif +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp index ab984c5f30..d8ea958a5d 100644 --- a/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp +++ b/lib/kokkos/core/unit_test/qthreads/TestQthreads_SubView_c_all.cpp @@ -10,3 +10,4 @@ #include #include #include +#include diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c13.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c13.cpp new file mode 100644 index 0000000000..b716dc8254 --- /dev/null +++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c13.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +namespace Test { + +TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset ) +{ + TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >(); +} + +} // namespace Test diff --git a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp index 24dc6b5061..53e8573ea8 100644 --- a/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp +++ b/lib/kokkos/core/unit_test/serial/TestSerial_SubView_c_all.cpp @@ -10,3 +10,4 @@ #include #include #include +#include diff --git a/lib/kokkos/core/unit_test/testmake.sh b/lib/kokkos/core/unit_test/testmake.sh new file mode 100755 index 0000000000..b5d4e8874d --- /dev/null +++ b/lib/kokkos/core/unit_test/testmake.sh @@ -0,0 +1,18 @@ +#!/bin/bash +if test "`basename $PWD`" = "cmaketest"; then + outfile=$1 +else + outfile=config/tmpstore/$1 +fi + +grep_arch=`grep KOKKOS_ARCH $outfile | grep $2 2>&1` +grep_devs=`grep KOKKOS_DEVICES $outfile | grep $3 2>&1` +if test -n "$grep_arch"; then + if test -n "$grep_devs"; then + echo Passed + else + echo Failed + fi +else + echo Failed +fi diff --git a/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c13.cpp b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c13.cpp new file mode 100644 index 0000000000..1f9679d3ae --- /dev/null +++ b/lib/kokkos/core/unit_test/threads/TestThreads_SubView_c13.cpp @@ -0,0 +1,54 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include +#include + +namespace Test { + +TEST_F( TEST_CATEGORY, view_test_unmanaged_subview_reset ) +{ + TestViewSubview::test_unmanaged_subview_reset< TEST_EXECSPACE >(); +} + +} // namespace Test diff --git a/lib/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf b/lib/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf new file mode 100644 index 0000000000..571ebff401 Binary files /dev/null and b/lib/kokkos/doc/SAND2017-10464-Kokkos-Task-DAG.pdf differ diff --git a/lib/kokkos/doc/develop_builds.md b/lib/kokkos/doc/develop_builds.md new file mode 100644 index 0000000000..9a211fa776 --- /dev/null +++ b/lib/kokkos/doc/develop_builds.md @@ -0,0 +1,76 @@ + +# Places to build options: architecture, device, advanced options, cuda options + +These are the files that need to be updated when a new architecture or device is +added: + + + generate_makefile.bash + * Interface for makefile system + + cmake/kokkos_options.cmake + * Interface for cmake system + + Makefile.kokkos + * Main logic for build (make and cmake) and defines (KokkosCore_config.h) + + core/unit_test/UnitTestConfig.make + * Unit test for Makefile.kokkos + +In general, an architecture is going to be from one of these platforms: + + AMD + + ARM + + IBM + + Intel + + Intel Xeon Phi + + NVIDIA +Although not strictly necessary, it is helpful to keep things organized by +grouping by platform. + +### generate_makefile.bash + +The bash code does not do any error checking on the `--arch=` or `--device=` +arguments thus strictly speaking you do not *need* to do anything to add a +device or architecture; however, you should add it to the help menu. For the +architectures, please group by one of the platforms listed above. + + +### cmake/kokkos_options.cmake and cmake/kokkos_settings.cmake + +The options for the CMake build system are: `-DKOKKOS_HOST_ARCH:STRING=` and +`-DKOKKOS_ENABLE_:BOOL=`. 
Although any string can be passed into +KOKKOS_HOST_ARCH option, it is checked against an accepted list. Likewise, the +KOKKOS_ENABLE_ must have the option added AND it is formed using the +list. Thus: + + A new architecture should be added to the KOKKOS_HOST_ARCH_LIST variable. + + A new device should be added to the KOKKOS_DEVICES_LIST variable **AND** a + KOKKOS_ENABLE_ option specified (see KOKKOS_ENABLE_CUDA for + example). + + A new device should be added to the KOKKOS_DEVICES_LIST variable **AND** a + +The translation from option to the `KOKKOS_SETTINGS` is done in +`kokkos_settings.cmake`. This translation is automated for some types if you add +to the list, but for others, it may need to be hand coded. + + +### Makefile.kokkos + +This is the main coding used by both the make and cmake system for defining +the sources (generated makefile and cmake snippets by `core/src/Makefile`), for +setting the defines in KokkosCore_config.h, and defining various internal +variables. To understand how to add to this file, you should work closely with +the Kokkos development team. + + +### core/unit_test/UnitTestConfig.make + +This file is used to check the build system in a platform-independent way. It +works by looping over available architectures and devices; thus, you should add +your new architecture to KOKKOS_ARCH_OPTIONS and your new device to +KOKKOS_DEVICE_OPTIONS to be tested. The build system tests work by grepping the +generated build files (automatically). The header file tests work by diffing +the generated file with results that are stored in +`core/unit_test/config/results` (namespaced by ARCH_DEVICE_). Thus, you will +need to add accepted results to this directory for diffing. + +The CMake build system is also tested in `core/unit_test/config/cmaketest`. +Because it uses cmake/kokkos_options.cmake, it already has the tests to loop +over. It is diffed with the same files that the build system is tested with. 
+Thus, if you are consistent in all of the files listed, the unit tests should +pass automatically. diff --git a/lib/kokkos/example/cmake/Dependencies.cmake b/lib/kokkos/example/cmake/Dependencies.cmake index ca50a45c11..ed1ec4c725 100644 --- a/lib/kokkos/example/cmake/Dependencies.cmake +++ b/lib/kokkos/example/cmake/Dependencies.cmake @@ -1,4 +1,3 @@ TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_REQUIRED_PACKAGES KokkosCore KokkosContainers KokkosAlgorithms TEST_OPTIONAL_TPLS CUSPARSE MKL ) diff --git a/lib/kokkos/example/cmake_build/CMakeLists.txt b/lib/kokkos/example/cmake_build/CMakeLists.txt index f92c5c6513..8e1aa04727 100644 --- a/lib/kokkos/example/cmake_build/CMakeLists.txt +++ b/lib/kokkos/example/cmake_build/CMakeLists.txt @@ -32,15 +32,13 @@ # 4. make cmake_minimum_required(VERSION 3.1) -project(Example CXX C) +project(Example CXX C Fortran) -set(CMAKE_CXX_STANDARD 11) -set(CMAKE_CXX_EXTENSIONS OFF) list(APPEND CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} -O3) add_subdirectory(${Example_SOURCE_DIR}/../.. ${Example_BINARY_DIR}/kokkos) include_directories(${Kokkos_INCLUDE_DIRS_RET}) -add_executable(example cmake_example.cpp) +add_executable(example cmake_example.cpp foo.f) target_link_libraries(example kokkos) diff --git a/lib/kokkos/example/cmake_build/cmake_example.cpp b/lib/kokkos/example/cmake_build/cmake_example.cpp index 4786eeb859..bba4b7bd01 100644 --- a/lib/kokkos/example/cmake_build/cmake_example.cpp +++ b/lib/kokkos/example/cmake_build/cmake_example.cpp @@ -44,6 +44,8 @@ #include #include +extern "C" void print_fortran_(); + int main(int argc, char* argv[]) { Kokkos::initialize(argc, argv); Kokkos::DefaultExecutionSpace::print_configuration(std::cout); @@ -81,6 +83,8 @@ int main(int argc, char* argv[]) { count_time = timer.seconds(); printf("Sequential: %ld %10.6f\n", seq_count, count_time); + print_fortran_(); + Kokkos::finalize(); return (count == seq_count) ? 
0 : -1; diff --git a/lib/kokkos/example/cmake_build/foo.f b/lib/kokkos/example/cmake_build/foo.f new file mode 100644 index 0000000000..e618455283 --- /dev/null +++ b/lib/kokkos/example/cmake_build/foo.f @@ -0,0 +1,4 @@ + FUNCTION print_fortran() + PRINT *, 'Hello World from Fortran' + RETURN + END diff --git a/lib/kokkos/example/feint/Makefile b/lib/kokkos/example/feint/Makefile index 9abf51d107..3f68c1c1cc 100644 --- a/lib/kokkos/example/feint/Makefile +++ b/lib/kokkos/example/feint/Makefile @@ -31,6 +31,10 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_EXAMPLE_FEINT += feint_cuda.o endif +ifeq ($(KOKKOS_INTERNAL_USE_ROCM), 1) + OBJ_EXAMPLE_FEINT += feint_rocm.o +endif + ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) OBJ_EXAMPLE_FEINT += feint_threads.o endif diff --git a/lib/kokkos/example/feint/feint_rocm.cpp b/lib/kokkos/example/feint/feint_rocm.cpp new file mode 100644 index 0000000000..2bb9e0ee15 --- /dev/null +++ b/lib/kokkos/example/feint/feint_rocm.cpp @@ -0,0 +1,67 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. 
Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include + +#if defined( KOKKOS_ENABLE_ROCM ) + +#include + +namespace Kokkos { +namespace Example { + +template void feint( + const unsigned global_elem_nx , + const unsigned global_elem_ny , + const unsigned global_elem_nz ); + +template void feint( + const unsigned global_elem_nx , + const unsigned global_elem_ny , + const unsigned global_elem_nz ); + +} /* namespace Example */ +} /* namespace Kokkos */ + +#endif + diff --git a/lib/kokkos/example/fenl/fenl.cpp b/lib/kokkos/example/fenl/fenl.cpp index 5a6dc7e241..62a614019c 100644 --- a/lib/kokkos/example/fenl/fenl.cpp +++ b/lib/kokkos/example/fenl/fenl.cpp @@ -110,6 +110,27 @@ Perf fenl< Kokkos::Cuda , Kokkos::Example::BoxElemPart::ElemQuadratic >( #endif +#if defined( KOKKOS_ENABLE_ROCM ) + +template +Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( + MPI_Comm comm , + const int use_print , + const int use_trials , + const int use_atomic , + const int global_elems[] ); + + +template +Perf fenl< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemQuadratic >( + MPI_Comm comm , + const int use_print , + const int use_trials , + const int use_atomic , + const int global_elems[] ); + +#endif + } /* namespace FENL */ } /* namespace Example */ diff --git a/lib/kokkos/example/fenl/main.cpp b/lib/kokkos/example/fenl/main.cpp index 33b0049bc6..0df5a60b89 100644 --- a/lib/kokkos/example/fenl/main.cpp +++ b/lib/kokkos/example/fenl/main.cpp @@ -67,6 +67,7 @@ enum { CMD_USE_THREADS = 0 , CMD_USE_NUMA , CMD_USE_CORE_PER_NUMA , CMD_USE_CUDA + , CMD_USE_ROCM , CMD_USE_OPENMP , CMD_USE_CUDA_DEV , CMD_USE_FIXTURE_X @@ -114,6 +115,9 @@ void print_cmdline( std::ostream & s , const int cmd[] ) if ( cmd[ CMD_USE_CUDA ] ) { s << " CUDA(" << cmd[ CMD_USE_CUDA_DEV ] << ")" ; } + if ( cmd[ CMD_USE_ROCM ] ) { + s << " ROCM" ; + } if ( cmd[ CMD_USE_ATOMIC ] ) { 
s << " ATOMIC" ; } @@ -167,6 +171,7 @@ void run( MPI_Comm comm , const int cmd[] ) if ( cmd[ CMD_USE_THREADS ] ) { std::cout << "THREADS , " << cmd[ CMD_USE_THREADS ] ; } else if ( cmd[ CMD_USE_OPENMP ] ) { std::cout << "OPENMP , " << cmd[ CMD_USE_OPENMP ] ; } else if ( cmd[ CMD_USE_CUDA ] ) { std::cout << "CUDA" ; } + else if ( cmd[ CMD_USE_ROCM ] ) { std::cout << "ROCM" ; } if ( cmd[ CMD_USE_FIXTURE_QUADRATIC ] ) { std::cout << " , QUADRATIC-ELEMENT" ; } else { std::cout << " , LINEAR-ELEMENT" ; } @@ -288,6 +293,9 @@ int main( int argc , char ** argv ) cmdline[ CMD_USE_CUDA ] = 1 ; cmdline[ CMD_USE_CUDA_DEV ] = atoi( argv[++i] ) ; } + else if ( 0 == strcasecmp( argv[i] , "rocm" ) ) { + cmdline[ CMD_USE_ROCM ] = 1 ; + } else if ( 0 == strcasecmp( argv[i] , "fixture" ) ) { sscanf( argv[++i] , "%dx%dx%d" , cmdline + CMD_USE_FIXTURE_X , @@ -410,6 +418,21 @@ int main( int argc , char ** argv ) Kokkos::HostSpace::execution_space::finalize(); } +#endif + +#if defined( KOKKOS_ENABLE_ROCM ) + if ( cmdline[ CMD_USE_ROCM ] ) { + // Use the last device: + + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice( cmdline[ CMD_USE_ROCM ] ) ); + + run< Kokkos::Experimental::ROCm , Kokkos::Example::BoxElemPart::ElemLinear >( comm , cmdline ); + + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } + #endif } diff --git a/lib/kokkos/example/fixture/Main.cpp b/lib/kokkos/example/fixture/Main.cpp index 9b2275ad27..1f17531bd0 100644 --- a/lib/kokkos/example/fixture/Main.cpp +++ b/lib/kokkos/example/fixture/Main.cpp @@ -300,5 +300,16 @@ int main() Kokkos::HostSpace::execution_space::finalize(); } #endif + +#if defined( KOKKOS_ENABLE_ROCM ) + { + std::cout << "test_fixture< ROCm >" << std::endl ; + Kokkos::HostSpace::execution_space::initialize(); + Kokkos::Experimental::ROCm::initialize( Kokkos::Experimental::ROCm::SelectDevice(0) ); + Kokkos::Example::test_fixture< 
Kokkos::Experimental::ROCm >(); + Kokkos::Experimental::ROCm::finalize(); + Kokkos::HostSpace::execution_space::finalize(); + } +#endif } diff --git a/lib/kokkos/example/fixture/TestFixture.cpp b/lib/kokkos/example/fixture/TestFixture.cpp index 924cc39ce4..d47621b3cd 100644 --- a/lib/kokkos/example/fixture/TestFixture.cpp +++ b/lib/kokkos/example/fixture/TestFixture.cpp @@ -53,6 +53,10 @@ template void test_fixture< Kokkos::HostSpace::execution_space >(); template void test_fixture(); #endif +#if defined( KOKKOS_ENABLE_ROCM ) +template void test_fixture(); +#endif + } /* namespace Example */ } /* namespace Kokkos */ diff --git a/lib/kokkos/example/tutorial/01_hello_world/Makefile b/lib/kokkos/example/tutorial/01_hello_world/Makefile index 62ab22f17e..02a0fb10a0 100644 --- a/lib/kokkos/example/tutorial/01_hello_world/Makefile +++ b/lib/kokkos/example/tutorial/01_hello_world/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_hello_world.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,7 +18,7 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_hello_world.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" @@ -37,7 +37,7 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean rm -f *.o *.cuda *.host diff --git a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile index 52d5fb07c4..4fe3765c52 100644 --- a/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile +++ b/lib/kokkos/example/tutorial/01_hello_world_lambda/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = 
${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_hello_world_lambda.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -19,7 +19,7 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_hello_world_lambda.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" @@ -38,7 +38,7 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean rm -f *.o *.cuda *.host diff --git a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile index d102af5151..bda28fbac0 100644 --- a/lib/kokkos/example/tutorial/02_simple_reduce/Makefile +++ b/lib/kokkos/example/tutorial/02_simple_reduce/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 02_simple_reduce.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,20 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 02_simple_reduce.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_simple_reduce.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif DEPFLAGS = -M @@ -37,10 +46,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile 
b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile index 4545668b77..a9542c6a43 100644 --- a/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile +++ b/lib/kokkos/example/tutorial/02_simple_reduce_lambda/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 02_simple_reduce_lambda.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -19,11 +19,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 02_simple_reduce_lambda.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_simple_reduce_lambda.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -38,10 +48,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/03_simple_view/Makefile b/lib/kokkos/example/tutorial/03_simple_view/Makefile index 32483a2555..de994a8df9 100644 --- a/lib/kokkos/example/tutorial/03_simple_view/Makefile +++ b/lib/kokkos/example/tutorial/03_simple_view/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_simple_view.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_simple_view.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = 
/opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_simple_view.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -38,10 +48,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile index b93c14910e..81910a4571 100644 --- a/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile +++ b/lib/kokkos/example/tutorial/03_simple_view_lambda/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_simple_view_lambda.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -19,11 +19,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_simple_view_lambda.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_simple_view_lambda.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -38,10 +48,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile index 
8dd7598f03..0e84ac9c68 100644 --- a/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile +++ b/lib/kokkos/example/tutorial/04_simple_memoryspaces/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 04_simple_memoryspaces.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 04_simple_memoryspaces.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_simple_memoryspaces.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/05_simple_atomics/Makefile b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile index d297d45576..67fbd90c55 100644 --- a/lib/kokkos/example/tutorial/05_simple_atomics/Makefile +++ b/lib/kokkos/example/tutorial/05_simple_atomics/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 05_simple_atomics.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 05_simple_atomics.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE 
= 05_simple_atomics.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt new file mode 100644 index 0000000000..d18938a61f --- /dev/null +++ b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/CMakeLists.txt @@ -0,0 +1,10 @@ + +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) +INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) + +# This is a tutorial, not a test, so we don't ask CTest to run it. +TRIBITS_ADD_EXECUTABLE( + tutorial_06_simple_mdrangepolicy + SOURCES simple_mdrangepolicy.cpp + COMM serial mpi + ) diff --git a/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile new file mode 100644 index 0000000000..7d3498ed17 --- /dev/null +++ b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/Makefile @@ -0,0 +1,48 @@ +KOKKOS_PATH = ../../.. 
+KOKKOS_SRC_PATH = ${KOKKOS_PATH} +SRC = $(wildcard ${KOKKOS_SRC_PATH}/example/tutorial/06_simple_mdrangepolicy/*.cpp) +vpath %.cpp $(sort $(dir $(SRC))) + +default: build + echo "Start Build" + +ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) +CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 06_simple_mdrangepolicy.cuda +KOKKOS_DEVICES = "Cuda,OpenMP" +KOKKOS_ARCH = "SNB,Kepler35" +else +CXX = g++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 06_simple_mdrangepolicy.host +KOKKOS_DEVICES = "OpenMP" +KOKKOS_ARCH = "SNB" +endif + +DEPFLAGS = -M + +OBJ = $(notdir $(SRC:.cpp=.o)) +LIB = + +include $(KOKKOS_PATH)/Makefile.kokkos + +build: $(EXE) + +test: $(EXE) + ./$(EXE) + +$(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + +clean: kokkos-clean + rm -f *.o *.cuda *.host + +# Compilation rules + +%.o:%.cpp $(KOKKOS_CPP_DEPENDS) + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) $(EXTRA_INC) -c $< -o $(notdir $@) diff --git a/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp new file mode 100644 index 0000000000..3712d6c5e2 --- /dev/null +++ b/lib/kokkos/example/tutorial/06_simple_mdrangepolicy/simple_mdrangepolicy.cpp @@ -0,0 +1,201 @@ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. 
Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#include <Kokkos_Core.hpp> +#include <cstdio> + +// +// MDRangePolicy example with parallel_for and parallel_reduce: +// 1. Start up Kokkos +// 2. Execute a parallel_for loop in the default execution space, +// using a functor to define the loop body +// 3.
Shut down Kokkos +// +// Two examples are provided: +// Example 1: Rank 2 case with minimal default parameters and arguments used +// in the MDRangePolicy +// +// Example 2: Rank 3 case with additional outer/inner iterate pattern parameters +// and tile dims passed to the ctor + + +// Simple functor for computing/storing the product of indices in a View v +template < class ViewType > +struct MDFunctor { + + typedef long value_type; + + ViewType v; + size_t size; + + MDFunctor( const ViewType & v_, const size_t size_ ) + : v(v_), size(size_) {} + + // 2D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator () (const int i, const int j) const { + v(i,j) = i*j; // compute the product of indices + } + + // 3D case - used by parallel_for + KOKKOS_INLINE_FUNCTION + void operator () (const int i, const int j, const int k) const { + v(i,j,k) = i*j*k; // compute the product of indices + } + + // 2D case - reduction + KOKKOS_INLINE_FUNCTION + void operator () (const int i, const int j, value_type & incorrect_count) const { + if ( v(i,j) != i*j ) { + incorrect_count += 1; + } + } + + // 3D case - reduction + KOKKOS_INLINE_FUNCTION + void operator () (const int i, const int j, const int k, value_type & incorrect_count) const { + if ( v(i,j,k) != i*j*k ) { + incorrect_count += 1; + } + } + +}; + +int main (int argc, char* argv[]) { + Kokkos::initialize (argc, argv); + + // Bound(s) for MDRangePolicy + const int n = 100; + + // ViewType typedefs for Rank<2>, Rank<3> for example usage + typedef double ScalarType; + typedef typename Kokkos::View<ScalarType**> ViewType_2D; + typedef typename Kokkos::View<ScalarType***> ViewType_3D; + + ///////////////////////////////////////////////////////////////////////////// + // Explanation of MDRangePolicy usage, template parameters, constructor arguments + // + // MDRangePolicy typedefs for Rank<2>, Rank<3> cases + // Required template parameters: + // Kokkos::Rank<N>: where N=rank + // + // Optional template parameters to Rank<...>: + //
Kokkos::Iterate::{Default,Left,Right}: Outer iteration pattern across tiles; + // defaults based on the execution space similar to Kokkos::Layout + // Kokkos::Iterate::{Default,Left,Right}: Inner iteration pattern within tiles; + // defaults based on the execution space similar to Kokkos::Layout + // + // e.g. typedef Rank<2, Iterate::Left, Iterate::Left> rank2ll; + // + // + // Optional template parameters to MDRangePolicy: + // ExecutionSpace: Kokkos::Serial, Kokkos::OpenMP, Kokkos::Cuda, etc. + // + // Kokkos::IndexType< T >: where T = int, long, unsigned int, etc. + // + // struct Tag{}: A user-provided tag for tagging functor operators + // + // e.g. 1: MDRangePolicy< Kokkos::Serial, Rank<2, Iterate::Left, Iterate::Left>, IndexType<int>, Tag > mdpolicy; + // e.g. 2: MDRangePolicy< Kokkos::Serial, rank2ll, IndexType<int>, Tag > mdpolicy; + // + // + // Required arguments to ctor: + // {{ l0, l1, ... }}: Lower bounds, provided as Kokkos::Array or std::initializer_list + // {{ u0, u1, ... }}: Upper bounds, provided as Kokkos::Array or std::initializer_list + // + // Optional arguments to ctor: + // {{ t0, t1, ... }}: Tile dimensions, provided as Kokkos::Array or std::initializer_list + // defaults based on the execution space + // + // e.g.
mdpolicy( {{0,0}}, {{u0,u1}}, {{t0,t1}} ); + // + ///////////////////////////////////////////////////////////////////////////// + + // Example 1: + long incorrect_count_2d = 0; + { + // Rank<2> Case: Rank is provided, all other parameters are default + typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<2> > MDPolicyType_2D; + + // Construct 2D MDRangePolicy: lower and upper bounds provided, tile dims defaulted + MDPolicyType_2D mdpolicy_2d( {{0,0}}, {{n,n}} ); + + // Construct a 2D view to store result of product of indices + ViewType_2D v2("v2", n, n); + + // Execute parallel_for with rank 2 MDRangePolicy + Kokkos::parallel_for( "md2d", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n) ); + + // Check results with a parallel_reduce using the MDRangePolicy + Kokkos::parallel_reduce( "md2dredux", mdpolicy_2d, MDFunctor<ViewType_2D>(v2, n), incorrect_count_2d ); + + printf("Rank 2 MDRangePolicy incorrect count: %ld\n", incorrect_count_2d); // should be 0 + } + + + // Example 2: + long incorrect_count_3d = 0; + { + // Rank<3> Case: Rank, outer iterate pattern, inner iterate pattern provided + typedef typename Kokkos::Experimental::MDRangePolicy< Kokkos::Experimental::Rank<3, Kokkos::Experimental::Iterate::Left, Kokkos::Experimental::Iterate::Left> > MDPolicyType_3D; + + // Construct 3D MDRangePolicy: lower, upper bounds, tile dims provided + MDPolicyType_3D mdpolicy_3d( {{0,0,0}}, {{n,n,n}}, {{4,4,4}} ); + + // Construct a 3D view to store result of product of indices + ViewType_3D v3("v3", n, n, n); + + // Execute parallel_for with rank 3 MDRangePolicy + Kokkos::parallel_for( "md3d", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n) ); + + // Check results with a parallel_reduce using the MDRangePolicy + Kokkos::parallel_reduce( "md3dredux", mdpolicy_3d, MDFunctor<ViewType_3D>(v3, n), incorrect_count_3d ); + + printf("Rank 3 MDRangePolicy incorrect count: %ld\n", incorrect_count_3d); // should be 0 + } + + Kokkos::finalize (); + + return (incorrect_count_2d == long(0) && incorrect_count_3d == long(0)) ?
0 : -1; +} + diff --git a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile index 956a4d1798..94ace811f3 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/01_data_layouts/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_data_layouts.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_data_layouts.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_data_layouts.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile index 41697b0731..f64ee3540e 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/02_memory_traits/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 02_memory_traits.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS 
= +LDFLAGS = EXE = 02_memory_traits.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_memory_traits.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile index 8d0697aa21..ad70ee02d1 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/03_subviews/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_subviews.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_subviews.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_subviews.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git 
a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile index 0a3acd984f..e08be5c1df 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/04_dualviews/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 04_dualviews.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 04_dualviews.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_dualviews.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile index 615ee2887a..ffd8184304 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/05_NVIDIA_UVM/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 05_NVIDIA_UVM.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,7 +18,7 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 05_NVIDIA_UVM.host KOKKOS_DEVICES = 
"OpenMP" KOKKOS_ARCH = "SNB" @@ -37,7 +37,7 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean rm -f *.o *.cuda *.host diff --git a/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile index dfb7d6df64..725d0de0e2 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile +++ b/lib/kokkos/example/tutorial/Advanced_Views/06_AtomicViews/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 06_AtomicViews.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 06_AtomicViews.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 06_AtomicViews.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile index 432a90126d..8983b46d60 100644 --- a/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile +++ 
b/lib/kokkos/example/tutorial/Advanced_Views/07_Overlapping_DeepCopy/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 --default-stream per-thread LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 07_Overlapping_DeepCopy.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,7 +18,7 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 07_Overlapping_DeepCopy.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" @@ -37,7 +37,7 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean rm -f *.o *.cuda *.host diff --git a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile index 60f6f94cdf..386a87474d 100644 --- a/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile +++ b/lib/kokkos/example/tutorial/Algorithms/01_random_numbers/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_random_numbers.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_random_numbers.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_random_numbers.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) 
$(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/CMakeLists.txt b/lib/kokkos/example/tutorial/CMakeLists.txt index d1fd4c0ae9..613e460cad 100644 --- a/lib/kokkos/example/tutorial/CMakeLists.txt +++ b/lib/kokkos/example/tutorial/CMakeLists.txt @@ -4,6 +4,7 @@ TRIBITS_ADD_EXAMPLE_DIRECTORIES(02_simple_reduce) TRIBITS_ADD_EXAMPLE_DIRECTORIES(03_simple_view) TRIBITS_ADD_EXAMPLE_DIRECTORIES(04_simple_memoryspaces) TRIBITS_ADD_EXAMPLE_DIRECTORIES(05_simple_atomics) +TRIBITS_ADD_EXAMPLE_DIRECTORIES(06_simple_mdrangepolicy) TRIBITS_ADD_EXAMPLE_DIRECTORIES(Advanced_Views) TRIBITS_ADD_EXAMPLE_DIRECTORIES(Hierarchical_Parallelism) diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile index 8c50430c30..7282abc30c 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_thread_teams.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_thread_teams.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_thread_teams.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) 
$(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile index b9b017bf1b..4049dbde34 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/01_thread_teams_lambda/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_thread_teams_lambda.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -19,11 +19,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 01_thread_teams_lambda.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 01_thread_teams_lambda.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -38,10 +48,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile index bae9351229..fe882f36b8 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/02_nested_parallel_for/Makefile @@ -10,7 +10,7 @@ ifneq 
(,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 02_nested_parallel_for.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 02_nested_parallel_for.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 02_nested_parallel_for.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile index a041b69b56..4481889cdb 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/03_vectorization/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_vectorization.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 03_vectorization.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 03_vectorization.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = 
-M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile index 6418875c9e..0f0bcf70de 100644 --- a/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile +++ b/lib/kokkos/example/tutorial/Hierarchical_Parallelism/04_team_scan/Makefile @@ -10,7 +10,7 @@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 04_team_scan.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = 04_team_scan.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = 04_team_scan.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + DEPFLAGS = -M @@ -37,10 +47,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.host *.rocm # Compilation rules diff --git a/lib/kokkos/example/tutorial/launch_bounds/Makefile b/lib/kokkos/example/tutorial/launch_bounds/Makefile index 5b605a4119..4a1bf17344 100644 --- a/lib/kokkos/example/tutorial/launch_bounds/Makefile +++ b/lib/kokkos/example/tutorial/launch_bounds/Makefile @@ -10,7 +10,7 
@@ ifneq (,$(findstring Cuda,$(KOKKOS_DEVICES))) CXX = ${KOKKOS_PATH}/bin/nvcc_wrapper CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = launch_bounds.cuda KOKKOS_DEVICES = "Cuda,OpenMP" KOKKOS_ARCH = "SNB,Kepler35" @@ -18,11 +18,21 @@ else CXX = g++ CXXFLAGS = -O3 LINK = ${CXX} -LINKFLAGS = +LDFLAGS = EXE = launch_bounds.host KOKKOS_DEVICES = "OpenMP" KOKKOS_ARCH = "SNB" endif +ifneq (,$(findstring ROCm,$(KOKKOS_DEVICES))) +CXX = /opt/rocm/hcc/bin/clang++ +CXXFLAGS = -O3 +LINK = ${CXX} +LDFLAGS = +EXE = launch_bounds.rocm +KOKKOS_DEVICES = "ROCm" +KOKKOS_ARCH = "Fiji" +endif + # WAR for "undefined memcpy" w/ Ubuntu + CUDA 7.5 CXXFLAGS += -D_FORCE_INLINES @@ -45,10 +55,10 @@ test: $(EXE) ./$(EXE) $(EXE): $(OBJ) $(KOKKOS_LINK_DEPENDS) - $(LINK) $(KOKKOS_LDFLAGS) $(LINKFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) + $(LINK) $(KOKKOS_LDFLAGS) $(LDFLAGS) $(EXTRA_PATH) $(OBJ) $(KOKKOS_LIBS) $(LIB) -o $(EXE) clean: kokkos-clean - rm -f *.o *.cuda *.host + rm -f *.o *.cuda *.rocm # Compilation rules diff --git a/lib/kokkos/generate_makefile.bash b/lib/kokkos/generate_makefile.bash index b4a69d30fd..521a77d1af 100755 --- a/lib/kokkos/generate_makefile.bash +++ b/lib/kokkos/generate_makefile.bash @@ -123,6 +123,7 @@ do echo " ARMv81 = ARMv8.1 Compatible CPU" echo " ARMv8-ThunderX = ARMv8 Cavium ThunderX CPU" echo " [IBM]" + echo " BGQ = IBM Blue Gene Q" echo " Power7 = IBM POWER7 and POWER7+ CPUs" echo " Power8 = IBM POWER8 CPUs" echo " Power9 = IBM POWER9 CPUs" @@ -271,9 +272,10 @@ else fi mkdir -p install -echo "#Makefile to satisfy existens of target kokkos-clean before installing the library" > install/Makefile.kokkos -echo "kokkos-clean:" >> install/Makefile.kokkos -echo "" >> install/Makefile.kokkos +gen_makefile=Makefile.kokkos +echo "#Makefile to satisfy existens of target kokkos-clean before installing the library" > install/${gen_makefile} +echo "kokkos-clean:" >> install/${gen_makefile} +echo "" >> install/${gen_makefile} mkdir -p core 
mkdir -p core/unit_test mkdir -p core/perf_test diff --git a/src/.gitignore b/src/.gitignore index fe23bc1f55..a9dc1c8fa1 100644 --- a/src/.gitignore +++ b/src/.gitignore @@ -547,6 +547,7 @@ /fix_shake.h /fix_shardlow.cpp /fix_shardlow.h +/random_external_state.h /fix_smd.cpp /fix_smd.h /fix_species.cpp diff --git a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp index b5aadb18d6..67dce07b80 100644 --- a/src/KOKKOS/atom_vec_hybrid_kokkos.cpp +++ b/src/KOKKOS/atom_vec_hybrid_kokkos.cpp @@ -255,6 +255,7 @@ int AtomVecHybridKokkos::pack_comm_kokkos(const int &n, const DAT::tdual_int_2d const int &pbc_flag, const int pbc[]) { error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm"); + return 0; } void AtomVecHybridKokkos::unpack_comm_kokkos(const int &n, const int &nfirst, const DAT::tdual_xfloat_2d &buf) @@ -266,12 +267,14 @@ int AtomVecHybridKokkos::pack_comm_self(const int &n, const DAT::tdual_int_2d &l const int &pbc_flag, const int pbc[]) { error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm"); + return 0; } int AtomVecHybridKokkos::pack_border_kokkos(int n, DAT::tdual_int_2d k_sendlist, DAT::tdual_xfloat_2d buf,int iswap, int pbc_flag, int *pbc, ExecutionSpace space) { error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm"); + return 0; } void AtomVecHybridKokkos::unpack_border_kokkos(const int &n, const int &nfirst, const DAT::tdual_xfloat_2d &buf, @@ -286,12 +289,14 @@ int AtomVecHybridKokkos::pack_exchange_kokkos(const int &nsend,DAT::tdual_xfloat X_FLOAT lo, X_FLOAT hi) { error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm"); + return 0; } int AtomVecHybridKokkos::unpack_exchange_kokkos(DAT::tdual_xfloat_2d &k_buf, int nrecv, int nlocal, int dim, X_FLOAT lo, X_FLOAT hi, ExecutionSpace space) { error->all(FLERR,"AtomVecHybridKokkos doesn't yet support threaded comm"); + return 0; } /* ---------------------------------------------------------------------- */ 
diff --git a/src/KOKKOS/comm_kokkos.cpp b/src/KOKKOS/comm_kokkos.cpp index e506fa1ad4..de35431eb9 100644 --- a/src/KOKKOS/comm_kokkos.cpp +++ b/src/KOKKOS/comm_kokkos.cpp @@ -200,6 +200,7 @@ void CommKokkos::forward_comm_device(int dummy) } n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist, iswap,k_buf_send,pbc_flag[iswap],pbc[iswap]); + DeviceType::fence(); if (n) { MPI_Send(k_buf_send.view().ptr_on_device(), n,MPI_DOUBLE,sendproc[iswap],0,world); @@ -229,11 +230,13 @@ void CommKokkos::forward_comm_device(int dummy) recvproc[iswap],0,world,&request); n = avec->pack_comm_kokkos(sendnum[iswap],k_sendlist,iswap, k_buf_send,pbc_flag[iswap],pbc[iswap]); + DeviceType::fence(); if (n) MPI_Send(k_buf_send.view().ptr_on_device(),n, MPI_DOUBLE,sendproc[iswap],0,world); if (size_forward_recv[iswap]) MPI_Wait(&request,MPI_STATUS_IGNORE); avec->unpack_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv); + DeviceType::fence(); } } else { @@ -321,6 +324,7 @@ void CommKokkos::reverse_comm_device() size_reverse_recv[iswap],MPI_DOUBLE, sendproc[iswap],0,world,&request); n = avec->pack_reverse_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_send); + DeviceType::fence(); if (n) MPI_Send(k_buf_send.view().ptr_on_device(),n, MPI_DOUBLE,recvproc[iswap],0,world); @@ -328,6 +332,7 @@ void CommKokkos::reverse_comm_device() } avec->unpack_reverse_kokkos(sendnum[iswap],k_sendlist,iswap, k_buf_recv); + DeviceType::fence(); } else { if (sendnum[iswap]) n = avec->unpack_reverse_self(sendnum[iswap],k_sendlist,iswap, @@ -395,6 +400,7 @@ void CommKokkos::forward_comm_pair_device(Pair *pair) n = pairKKBase->pack_forward_comm_kokkos(sendnum[iswap],k_sendlist, iswap,k_buf_send_pair,pbc_flag[iswap],pbc[iswap]); + DeviceType::fence(); // exchange with another proc // if self, set recv buffer to send buffer @@ -411,6 +417,7 @@ void CommKokkos::forward_comm_pair_device(Pair *pair) // unpack buffer pairKKBase->unpack_forward_comm_kokkos(recvnum[iswap],firstrecv[iswap],k_buf_recv_pair); + 
DeviceType::fence(); } } @@ -610,6 +617,7 @@ void CommKokkos::exchange_device() k_exchange_sendlist,k_exchange_copylist, ExecutionSpaceFromDevice:: space,dim,lo,hi); + DeviceType::fence(); } else { while (i < nlocal) { @@ -634,6 +642,7 @@ void CommKokkos::exchange_device() atom->nlocal=avec-> unpack_exchange_kokkos(k_buf_send,nrecv,atom->nlocal,dim,lo,hi, ExecutionSpaceFromDevice::space); + DeviceType::fence(); } } else { MPI_Sendrecv(&nsend,1,MPI_INT,procneigh[dim][0],0, @@ -666,6 +675,7 @@ void CommKokkos::exchange_device() atom->nlocal = avec-> unpack_exchange_kokkos(k_buf_recv,nrecv,atom->nlocal,dim,lo,hi, ExecutionSpaceFromDevice::space); + DeviceType::fence(); } } @@ -926,10 +936,12 @@ void CommKokkos::borders_device() { n = avec->pack_border_vel(nsend,sendlist[iswap],buf_send, pbc_flag[iswap],pbc[iswap]); } - else + else { n = avec-> pack_border_kokkos(nsend,k_sendlist,k_buf_send,iswap, pbc_flag[iswap],pbc[iswap],exec_space); + DeviceType::fence(); + } // swap atoms with other proc // no MPI calls except SendRecv if nsend/nrecv = 0 @@ -960,12 +972,15 @@ void CommKokkos::borders_device() { avec->unpack_border_vel(nrecv,atom->nlocal+atom->nghost,buf); } else - if (sendproc[iswap] != me) + if (sendproc[iswap] != me) { avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, k_buf_recv,exec_space); - else + DeviceType::fence(); + } else { avec->unpack_border_kokkos(nrecv,atom->nlocal+atom->nghost, k_buf_send,exec_space); + DeviceType::fence(); + } // set all pointers & counters diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.cpp b/src/KOKKOS/fix_qeq_reax_kokkos.cpp index aefc2ea130..0f578707a4 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.cpp +++ b/src/KOKKOS/fix_qeq_reax_kokkos.cpp @@ -64,6 +64,10 @@ FixQEqReaxKokkos(LAMMPS *lmp, int narg, char **arg) : nmax = nmax = m_cap = 0; allocated_flag = 0; nprev = 4; + + memory->destroy(s_hist); + memory->destroy(t_hist); + grow_arrays(atom->nmax); } /* ---------------------------------------------------------------------- 
*/ @@ -72,6 +76,9 @@ template FixQEqReaxKokkos::~FixQEqReaxKokkos() { if (copymode) return; + + memoryKK->destroy_kokkos(k_s_hist,s_hist); + memoryKK->destroy_kokkos(k_t_hist,t_hist); } /* ---------------------------------------------------------------------- */ @@ -157,25 +164,11 @@ void FixQEqReaxKokkos::init_shielding_k() template void FixQEqReaxKokkos::init_hist() { - int i,j; - - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",atom->nmax,nprev); - d_s_hist = k_s_hist.template view(); - h_s_hist = k_s_hist.h_view; - k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",atom->nmax,nprev); - d_t_hist = k_t_hist.template view(); - h_t_hist = k_t_hist.h_view; - - for( i = 0; i < atom->nmax; i++ ) - for( j = 0; j < nprev; j++ ) - k_s_hist.h_view(i,j) = k_t_hist.h_view(i,j) = 0.0; - - k_s_hist.template modify(); - k_s_hist.template sync(); - - k_t_hist.template modify(); - k_t_hist.template sync(); + Kokkos::deep_copy(d_s_hist,0.0); + Kokkos::deep_copy(d_t_hist,0.0); + k_s_hist.template modify(); + k_t_hist.template modify(); } /* ---------------------------------------------------------------------- */ @@ -235,6 +228,8 @@ void FixQEqReaxKokkos::pre_force(int vflag) Kokkos::parallel_scan(inum,computeH_functor); // init_matvec + k_s_hist.template sync(); + k_t_hist.template sync(); FixQEqReaxKokkosMatVecFunctor matvec_functor(this); Kokkos::parallel_for(inum,matvec_functor); @@ -268,6 +263,8 @@ void FixQEqReaxKokkos::pre_force(int vflag) // calculate_Q(); calculate_q(); + k_s_hist.template modify(); + k_t_hist.template modify(); copymode = 0; @@ -340,14 +337,6 @@ void FixQEqReaxKokkos::allocate_array() k_d = DAT::tdual_ffloat_1d("qeq/kk:h_d",nmax); d_d = k_d.template view(); h_d = k_d.h_view; - - k_s_hist = DAT::tdual_ffloat_2d("qeq/kk:s_hist",nmax,nprev); - d_s_hist = k_s_hist.template view(); - h_s_hist = k_s_hist.h_view; - - k_t_hist = DAT::tdual_ffloat_2d("qeq/kk:t_hist",nmax,nprev); - d_t_hist = k_t_hist.template view(); - h_t_hist = k_t_hist.h_view; } // init_storage 
@@ -375,8 +364,6 @@ void FixQEqReaxKokkos::zero_item(int ii) const d_o[i] = 0.0; d_r[i] = 0.0; d_d[i] = 0.0; - //for( int j = 0; j < nprev; j++ ) - //d_s_hist(i,j) = d_t_hist(i,j) = 0.0; } } @@ -468,7 +455,7 @@ double FixQEqReaxKokkos::calculate_H_k(const F_FLOAT &r, const F_FLO template KOKKOS_INLINE_FUNCTION -void FixQEqReaxKokkos::mat_vec_item(int ii) const +void FixQEqReaxKokkos::matvec_item(int ii) const { const int i = d_ilist[ii]; const int itype = type(i); @@ -1175,7 +1162,77 @@ double FixQEqReaxKokkos::memory_usage() return bytes; } -/* ---------------------------------------------------------------------- */\ +/* ---------------------------------------------------------------------- + allocate fictitious charge arrays +------------------------------------------------------------------------- */ + +template +void FixQEqReaxKokkos::grow_arrays(int nmax) +{ + k_s_hist.template sync(); // force reallocation on host + k_t_hist.template sync(); + + memoryKK->grow_kokkos(k_s_hist,s_hist,nmax,nprev,"qeq:s_hist"); + memoryKK->grow_kokkos(k_t_hist,t_hist,nmax,nprev,"qeq:t_hist"); + + d_s_hist = k_s_hist.template view(); + d_t_hist = k_t_hist.template view(); + + k_s_hist.template modify(); + k_t_hist.template modify(); +} + +/* ---------------------------------------------------------------------- + copy values within fictitious charge arrays +------------------------------------------------------------------------- */ + +template +void FixQEqReaxKokkos::copy_arrays(int i, int j, int delflag) +{ + k_s_hist.template sync(); + k_t_hist.template sync(); + + for (int m = 0; m < nprev; m++) { + s_hist[j][m] = s_hist[i][m]; + t_hist[j][m] = t_hist[i][m]; + } + + k_s_hist.template modify(); + k_t_hist.template modify(); +} + +/* ---------------------------------------------------------------------- + pack values in local atom-based array for exchange with another proc +------------------------------------------------------------------------- */ + +template +int 
FixQEqReaxKokkos::pack_exchange(int i, double *buf) +{ + k_s_hist.template sync(); + k_t_hist.template sync(); + + for (int m = 0; m < nprev; m++) buf[m] = s_hist[i][m]; + for (int m = 0; m < nprev; m++) buf[nprev+m] = t_hist[i][m]; + return nprev*2; +} + +/* ---------------------------------------------------------------------- + unpack values in local atom-based array from exchange with another proc +------------------------------------------------------------------------- */ + +template +int FixQEqReaxKokkos::unpack_exchange(int nlocal, double *buf) +{ + for (int m = 0; m < nprev; m++) s_hist[nlocal][m] = buf[m]; + for (int m = 0; m < nprev; m++) t_hist[nlocal][m] = buf[nprev+m]; + + k_s_hist.template modify(); + k_t_hist.template modify(); + + return nprev*2; +} + +/* ---------------------------------------------------------------------- */ namespace LAMMPS_NS { template class FixQEqReaxKokkos; diff --git a/src/KOKKOS/fix_qeq_reax_kokkos.h b/src/KOKKOS/fix_qeq_reax_kokkos.h index 64f81b9141..27be712ad8 100644 --- a/src/KOKKOS/fix_qeq_reax_kokkos.h +++ b/src/KOKKOS/fix_qeq_reax_kokkos.h @@ -58,7 +58,7 @@ class FixQEqReaxKokkos : public FixQEqReax { void compute_h_item(int, int &, const bool &) const; KOKKOS_INLINE_FUNCTION - void mat_vec_item(int) const; + void matvec_item(int) const; KOKKOS_INLINE_FUNCTION void sparse12_item(int) const; @@ -146,7 +146,7 @@ class FixQEqReaxKokkos : public FixQEqReax { void unpack_reverse_comm(int, int *, double *); double memory_usage(); - protected: + private: int inum; int allocated_flag; @@ -213,6 +213,10 @@ class FixQEqReaxKokkos : public FixQEqReax { typename AT::t_int_2d d_sendlist; typename AT::t_xfloat_1d_um v_buf; + void grow_arrays(int); + void copy_arrays(int, int, int); + int pack_exchange(int, double *); + int unpack_exchange(int, double *); }; template @@ -238,7 +242,7 @@ struct FixQEqReaxKokkosMatVecFunctor { }; KOKKOS_INLINE_FUNCTION void operator()(const int ii) const { - c.mat_vec_item(ii); + c.matvec_item(ii); 
} }; diff --git a/src/KOKKOS/fix_shardlow_kokkos.cpp b/src/KOKKOS/fix_shardlow_kokkos.cpp index cc1bd6bede..571f488023 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.cpp +++ b/src/KOKKOS/fix_shardlow_kokkos.cpp @@ -61,6 +61,7 @@ using namespace LAMMPS_NS; using namespace FixConst; +using namespace random_external_state; #define EPSILON 1.0e-10 #define EPSILON_SQUARED ((EPSILON) * (EPSILON)) @@ -89,10 +90,6 @@ FixShardlowKokkos::FixShardlowKokkos(LAMMPS *lmp, int narg, char **a // if(k_pairDPDE){ comm_forward = 3; comm_reverse = 5; - maxRNG = 0; -#ifdef DPD_USE_RAN_MARS - pp_random = NULL; -#endif // } else { // comm_forward = 3; // comm_reverse = 3; @@ -121,13 +118,6 @@ template FixShardlowKokkos::~FixShardlowKokkos() { ghostmax = 0; -#ifdef DPD_USE_RAN_MARS - if (pp_random) { - for (int i = 1; i < maxRNG; ++i) delete pp_random[i]; - delete[] pp_random; - pp_random = NULL; - } -#endif } /* ---------------------------------------------------------------------- */ @@ -279,11 +269,7 @@ void FixShardlowKokkos::ssa_update_dpd( int start_ii, int count, int id ) { -#ifdef DPD_USE_RAN_MARS - class RanMars *pRNG = pp_random[id]; -#else - rand_type rand_gen = rand_pool.get_state(id); -#endif + es_RNG_t RNGstate = d_rand_state(id); int ct = count; int ii = start_ii; @@ -345,12 +331,7 @@ void FixShardlowKokkos::ssa_update_dpd( double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma; double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv; - double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * -#ifdef DPD_USE_RAN_MARS - pRNG->gaussian(); -#else - rand_gen.normal(); -#endif + double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * es_normal(RNGstate); const double mass_j = masses(massPerI ? 
j : jtype); double massinv_j = 1.0 / mass_j; @@ -412,9 +393,7 @@ void FixShardlowKokkos::ssa_update_dpd( v(i, 2) = vzi; } -#ifndef DPD_USE_RAN_MARS - rand_pool.free_state(rand_gen); -#endif + d_rand_state(id) = RNGstate; } #endif @@ -431,11 +410,7 @@ void FixShardlowKokkos::ssa_update_dpde( int start_ii, int count, int id ) const { -#ifdef DPD_USE_RAN_MARS - class RanMars *pRNG = pp_random[id]; -#else - rand_type rand_gen = rand_pool.get_state(id); -#endif + es_RNG_t RNGstate = d_rand_state(id); int ct = count; int ii = start_ii; @@ -506,12 +481,7 @@ void FixShardlowKokkos::ssa_update_dpde( double halfsigma_ij = STACKPARAMS?m_params[itype][jtype].halfsigma:params(itype,jtype).halfsigma; double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv; - double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * -#ifdef DPD_USE_RAN_MARS - pRNG->gaussian(); -#else - rand_gen.normal(); -#endif + double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * es_normal(RNGstate); const double mass_j = masses(massPerI ? 
j : jtype); double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v; @@ -520,12 +490,7 @@ void FixShardlowKokkos::ssa_update_dpde( // Compute uCond double kappa_ij = STACKPARAMS?m_params[itype][jtype].kappa:params(itype,jtype).kappa; double alpha_ij = STACKPARAMS?m_params[itype][jtype].alpha:params(itype,jtype).alpha; - double del_uCond = alpha_ij*wr*dtsqrt * -#ifdef DPD_USE_RAN_MARS - pRNG->gaussian(); -#else - rand_gen.normal(); -#endif + double del_uCond = alpha_ij*wr*dtsqrt * es_normal(RNGstate); del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt; uCond[j] -= del_uCond; @@ -601,9 +566,7 @@ void FixShardlowKokkos::ssa_update_dpde( ii++; } -#ifndef DPD_USE_RAN_MARS - rand_pool.free_state(rand_gen); -#endif + d_rand_state(id) = RNGstate; } @@ -644,20 +607,16 @@ void FixShardlowKokkos::initial_integrate(int vflag) maxWorkItemCt = (int) ssa_gitemLoc.dimension_1(); } if (maxWorkItemCt > maxRNG) { -#ifdef DPD_USE_RAN_MARS - if (pp_random) { - for (int i = 1; i < maxRNG; ++i) delete pp_random[i]; - delete[] pp_random; - pp_random = NULL; + es_RNG_t serial_rand_state; + es_init(serial_rand_state, pairDPDE->seed + comm->me); + + d_rand_state = es_RNGs_type("Kokkos::fix_shardlow::rand_state",maxWorkItemCt); + typename es_RNGs_type::HostMirror h_rand_state = create_mirror_view(d_rand_state); + for (int i = 0; i < maxWorkItemCt; ++i) { + es_genNextParallelState(serial_rand_state, h_rand_state(i)); } - pp_random = new RanMars*[maxWorkItemCt]; - for (int i = 1; i < maxWorkItemCt; ++i) { - pp_random[i] = new RanMars(lmp, k_pairDPDE->seed + comm->me + comm->nprocs*i); - } - pp_random[0] = k_pairDPDE->random; -#else - rand_pool.init(k_pairDPDE->seed + comm->me, maxWorkItemCt); -#endif + deep_copy(d_rand_state,h_rand_state); + maxRNG = maxWorkItemCt; } diff --git a/src/KOKKOS/fix_shardlow_kokkos.h b/src/KOKKOS/fix_shardlow_kokkos.h index 70dccf2e2d..4e87de6910 100644 --- a/src/KOKKOS/fix_shardlow_kokkos.h +++ b/src/KOKKOS/fix_shardlow_kokkos.h @@ -93,17 +93,6 @@ class 
FixShardlowKokkos : public FixShardlow { #endif PairDPDfdtEnergyKokkos *k_pairDPDE; - int maxRNG; -#ifdef DPD_USE_RAN_MARS - class RanMars **pp_random; -#elif defined(DPD_USE_Random_XorShift1024) - Kokkos::Random_XorShift1024_Pool rand_pool; - typedef typename Kokkos::Random_XorShift1024_Pool::generator_type rand_type; -#else - Kokkos::Random_XorShift64_Pool rand_pool; - typedef typename Kokkos::Random_XorShift64_Pool::generator_type rand_type; -#endif - Kokkos::DualView k_params; typename Kokkos::DualView::t_dev_const_um params; @@ -127,6 +116,10 @@ class FixShardlowKokkos : public FixShardlow { typename AT::t_float_1d_randomread masses; typename AT::t_efloat_1d dpdTheta; + // Storage for the es_RNG state variables + typedef Kokkos::View es_RNGs_type; + es_RNGs_type d_rand_state; + double dtsqrt; // = sqrt(update->dt); int ghostmax; int nlocal, nghost; diff --git a/src/KOKKOS/gridcomm_kokkos.cpp b/src/KOKKOS/gridcomm_kokkos.cpp index fdfaf296ef..f107370514 100644 --- a/src/KOKKOS/gridcomm_kokkos.cpp +++ b/src/KOKKOS/gridcomm_kokkos.cpp @@ -523,6 +523,7 @@ void GridCommKokkos::forward_comm(KSpace *kspace, int which) kspaceKKBase->pack_forward_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m); else kspaceKKBase->pack_forward_kspace_kokkos(which,k_buf1,swap[m].npack,k_packlist,m); + DeviceType::fence(); if (swap[m].sendproc != me) { MPI_Irecv(k_buf2.view().ptr_on_device(),nforward*swap[m].nunpack,MPI_FFT_SCALAR, @@ -533,6 +534,7 @@ void GridCommKokkos::forward_comm(KSpace *kspace, int which) } kspaceKKBase->unpack_forward_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m); + DeviceType::fence(); } } @@ -554,6 +556,7 @@ void GridCommKokkos::reverse_comm(KSpace *kspace, int which) kspaceKKBase->pack_reverse_kspace_kokkos(which,k_buf2,swap[m].nunpack,k_unpacklist,m); else kspaceKKBase->pack_reverse_kspace_kokkos(which,k_buf1,swap[m].nunpack,k_unpacklist,m); + DeviceType::fence(); if (swap[m].recvproc != me) { 
MPI_Irecv(k_buf2.view().ptr_on_device(),nreverse*swap[m].npack,MPI_FFT_SCALAR, @@ -564,6 +567,7 @@ void GridCommKokkos::reverse_comm(KSpace *kspace, int which) } kspaceKKBase->unpack_reverse_kspace_kokkos(which,k_buf2,swap[m].npack,k_packlist,m); + DeviceType::fence(); } } diff --git a/src/MAKE/MACHINES/Makefile.white b/src/MAKE/MACHINES/Makefile.white index 53de76e736..42daf850f3 100644 --- a/src/MAKE/MACHINES/Makefile.white +++ b/src/MAKE/MACHINES/Makefile.white @@ -7,7 +7,7 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler KOKKOS_ABSOLUTE_PATH = $(shell cd $(KOKKOS_PATH); pwd) -export OMPI_CXX = $(KOKKOS_ABSOLUTE_PATH)/config/nvcc_wrapper +export OMPI_CXX = $(KOKKOS_ABSOLUTE_PATH)/bin/nvcc_wrapper CC = mpicxx CCFLAGS = -g -O3 SHFLAGS = -fPIC @@ -21,8 +21,8 @@ SIZE = size ARCHIVE = ar ARFLAGS = -rc SHLIBFLAGS = -shared -KOKKOS_DEVICES = Cuda, OpenMP -KOKKOS_ARCH = Kepler35 +KOKKOS_DEVICES = Cuda +KOKKOS_ARCH = Pascal60,Power8 # --------------------------------------------------------------------- # LAMMPS-specific settings, all OPTIONAL diff --git a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi index d6568a428f..54122c1b19 100644 --- a/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi +++ b/src/MAKE/OPTIONS/Makefile.kokkos_cuda_mpi @@ -7,8 +7,8 @@ SHELL = /bin/sh # specify flags and libraries needed for your compiler KOKKOS_ABSOLUTE_PATH = $(shell cd $(KOKKOS_PATH); pwd) -export MPICH_CXX = $(KOKKOS_ABSOLUTE_PATH)/config/nvcc_wrapper -export OMPI_CXX = $(KOKKOS_ABSOLUTE_PATH)/config/nvcc_wrapper +export MPICH_CXX = $(KOKKOS_ABSOLUTE_PATH)/bin/nvcc_wrapper +export OMPI_CXX = $(KOKKOS_ABSOLUTE_PATH)/bin/nvcc_wrapper CC = mpicxx CCFLAGS = -g -O3 SHFLAGS = -fPIC @@ -22,7 +22,7 @@ SIZE = size ARCHIVE = ar ARFLAGS = -rc SHLIBFLAGS = -shared -KOKKOS_DEVICES = Cuda, OpenMP +KOKKOS_DEVICES = Cuda KOKKOS_ARCH = Kepler35 # --------------------------------------------------------------------- diff --git 
a/src/USER-DPD/fix_shardlow.cpp b/src/USER-DPD/fix_shardlow.cpp index cec53ab15f..39b7ba2224 100644 --- a/src/USER-DPD/fix_shardlow.cpp +++ b/src/USER-DPD/fix_shardlow.cpp @@ -48,7 +48,6 @@ #include "neighbor.h" #include "neigh_list.h" #include "neigh_request.h" -#include "random_mars.h" #include "memory.h" #include "domain.h" #include "modify.h" @@ -60,6 +59,7 @@ using namespace LAMMPS_NS; using namespace FixConst; +using namespace random_external_state; #define EPSILON 1.0e-10 #define EPSILON_SQUARED ((EPSILON) * (EPSILON)) @@ -87,6 +87,7 @@ static const char cite_fix_shardlow[] = FixShardlow::FixShardlow(LAMMPS *lmp, int narg, char **arg) : Fix(lmp, narg, arg), pairDPD(NULL), pairDPDE(NULL), v_t0(NULL) + ,rand_state(NULL) { if (lmp->citeme) lmp->citeme->add(cite_fix_shardlow); @@ -99,6 +100,7 @@ FixShardlow::FixShardlow(LAMMPS *lmp, int narg, char **arg) : if (pairDPDE == NULL) pairDPDE = (PairDPDfdtEnergy *) force->pair_match("dpd/fdt/energy/kk",1); + maxRNG = 0; if(pairDPDE){ comm_forward = 3; comm_reverse = 5; @@ -116,6 +118,8 @@ FixShardlow::FixShardlow(LAMMPS *lmp, int narg, char **arg) : FixShardlow::~FixShardlow() { + memory->destroy(rand_state); + maxRNG = 0; } /* ---------------------------------------------------------------------- */ @@ -173,12 +177,12 @@ void FixShardlow::setup(int vflag) NOTE: only implemented for orthogonal boxes, not triclinic ------------------------------------------------------------------------- */ void FixShardlow::ssa_update_dpd( - int i, - int *jlist, - int jlen + int start_ii, + int count, + int id ) { - class RanMars *pRNG; + es_RNG_t RNGstate = rand_state[id]; double **x = atom->x; double **v = atom->v; double *rmass = atom->rmass; @@ -192,6 +196,16 @@ void FixShardlow::ssa_update_dpd( const double dt = update->dt; + int ct = count; + int ii = start_ii; + +while (ct-- > 0) { + const int i = list->ilist[ii]; + const int *jlist = list->firstneigh[ii]; + const int jlen = list->numneigh[ii]; + ii++; + if (jlen <= 0) 
continue; + const double xtmp = x[i][0]; const double ytmp = x[i][1]; const double ztmp = x[i][2]; @@ -203,7 +217,6 @@ void FixShardlow::ssa_update_dpd( int itype = type[i]; - pRNG = pairDPD->random; cut2_i = pairDPD->cutsq[itype]; cut_i = pairDPD->cut[itype]; sigma_i = pairDPD->sigma[itype]; @@ -254,7 +267,7 @@ void FixShardlow::ssa_update_dpd( double halfsigma_ij = 0.5*sigma_i[jtype]; double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv; - double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * pRNG->gaussian(); + double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * es_normal(RNGstate); double mass_j = (rmass) ? rmass[j] : mass[jtype]; double massinv_j = 1.0 / mass_j; @@ -316,6 +329,9 @@ void FixShardlow::ssa_update_dpd( v[i][2] = vzi; } + rand_state[id] = RNGstate; +} + /* ---------------------------------------------------------------------- Perform the stochastic integration and Shardlow update for constant energy Allow for both per-type and per-atom mass @@ -323,12 +339,12 @@ void FixShardlow::ssa_update_dpd( NOTE: only implemented for orthogonal boxes, not triclinic ------------------------------------------------------------------------- */ void FixShardlow::ssa_update_dpde( - int i, - int *jlist, - int jlen + int start_ii, + int count, + int id ) { - class RanMars *pRNG; + es_RNG_t RNGstate = rand_state[id]; double **x = atom->x; double **v = atom->v; double *rmass = atom->rmass; @@ -346,6 +362,16 @@ void FixShardlow::ssa_update_dpde( const double dt = update->dt; + int ct = count; + int ii = start_ii; + +while (ct-- > 0) { + const int i = list->ilist[ii]; + const int *jlist = list->firstneigh[ii]; + const int jlen = list->numneigh[ii]; + ii++; + if (jlen <= 0) continue; + const double xtmp = x[i][0]; const double ytmp = x[i][1]; const double ztmp = x[i][2]; @@ -359,7 +385,6 @@ void FixShardlow::ssa_update_dpde( double uCond_i = uCond[i]; int itype = type[i]; - pRNG = pairDPDE->random; cut2_i = pairDPDE->cutsq[itype]; cut_i = pairDPDE->cut[itype]; 
sigma_i = pairDPDE->sigma[itype]; @@ -415,7 +440,7 @@ void FixShardlow::ssa_update_dpde( double halfsigma_ij = 0.5*sigma_i[jtype]; double halfgamma_ij = halfsigma_ij*halfsigma_ij*boltz_inv*theta_ij_inv; - double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * pRNG->gaussian(); + double sigmaRand = halfsigma_ij*wr*dtsqrt*ftm2v * es_normal(RNGstate); double mass_j = (rmass) ? rmass[j] : mass[jtype]; double mass_ij_div_neg4_ftm2v = mass_j*mass_i_div_neg4_ftm2v; @@ -424,7 +449,7 @@ void FixShardlow::ssa_update_dpde( // Compute uCond double kappa_ij = kappa_i[jtype]; double alpha_ij = sqrt(boltz2*kappa_ij); - double del_uCond = alpha_ij*wr*dtsqrt * pRNG->gaussian(); + double del_uCond = alpha_ij*wr*dtsqrt * es_normal(RNGstate); del_uCond += kappa_ij*(theta_i_inv - theta_j_inv)*wdt; uCond[j] -= del_uCond; @@ -499,6 +524,9 @@ void FixShardlow::ssa_update_dpde( uCond[i] = uCond_i; } + rand_state[id] = RNGstate; +} + void FixShardlow::initial_integrate(int vflag) { int i,ii,inum; @@ -507,7 +535,6 @@ void FixShardlow::initial_integrate(int vflag) int nlocal = atom->nlocal; int nghost = atom->nghost; - int airnum; const bool useDPDE = (pairDPDE != NULL); // NOTE: this logic is specific to orthogonal boxes, not triclinic @@ -530,6 +557,31 @@ void FixShardlow::initial_integrate(int vflag) error->one(FLERR, msg); } + NPairHalfBinNewtonSSA *np_ssa = dynamic_cast(list->np); + if (!np_ssa) error->one(FLERR, "NPair wasn't a NPairHalfBinNewtonSSA object"); + int ssa_phaseCt = np_ssa->ssa_phaseCt; + int *ssa_phaseLen = np_ssa->ssa_phaseLen; + int **ssa_itemLoc = np_ssa->ssa_itemLoc; + int **ssa_itemLen = np_ssa->ssa_itemLen; + int ssa_gphaseCt = np_ssa->ssa_gphaseCt; + int *ssa_gphaseLen = np_ssa->ssa_gphaseLen; + int **ssa_gitemLoc = np_ssa->ssa_gitemLoc; + int **ssa_gitemLen = np_ssa->ssa_gitemLen; + + int maxWorkItemCt = np_ssa->ssa_maxPhaseLen; + if (maxWorkItemCt > maxRNG) { + uint64_t my_seed = comm->me + (useDPDE ? 
pairDPDE->seed : pairDPD->seed); + es_RNG_t serial_rand_state; + es_init(serial_rand_state, my_seed); + + memory->grow(rand_state, maxWorkItemCt, "FixShardlow:rand_state"); + for (int i = 0; i < maxWorkItemCt; ++i) { + es_genNextParallelState(serial_rand_state, rand_state[i]); + } + + maxRNG = maxWorkItemCt; + } + #ifdef DEBUG_SSA_PAIR_CT for (int i = 0; i < 2; ++i) for (int j = 0; j < 3; ++j) @@ -545,13 +597,6 @@ void FixShardlow::initial_integrate(int vflag) dtsqrt = sqrt(update->dt); - NPairHalfBinNewtonSSA *np_ssa = dynamic_cast(list->np); - if (!np_ssa) error->one(FLERR, "NPair wasn't a NPairHalfBinNewtonSSA object"); - int ssa_phaseCt = np_ssa->ssa_phaseCt; - int *ssa_phaseLen = np_ssa->ssa_phaseLen; - int **ssa_itemLoc = np_ssa->ssa_itemLoc; - int **ssa_itemLen = np_ssa->ssa_itemLen; - // process neighbors in the local AIR for (int workPhase = 0; workPhase < ssa_phaseCt; ++workPhase) { int workItemCt = ssa_phaseLen[workPhase]; @@ -559,22 +604,14 @@ void FixShardlow::initial_integrate(int vflag) for (int workItem = 0; workItem < workItemCt; ++workItem) { int ct = ssa_itemLen[workPhase][workItem]; ii = ssa_itemLoc[workPhase][workItem]; - - while (ct-- > 0) { - int len = list->numneigh[ii]; - if (len > 0) { - if (useDPDE) ssa_update_dpde(ilist[ii], list->firstneigh[ii], len); - else ssa_update_dpd(ilist[ii], list->firstneigh[ii], len); - } - ii++; - } + if (useDPDE) ssa_update_dpde(ii, ct, workItem); + else ssa_update_dpd(ii, ct, workItem); } } - ii = inum; //Loop over all 13 outward directions (7 stages) - for (airnum = 1; airnum <=7; airnum++){ - int ct = list->AIRct_ssa[airnum]; + for (int workPhase = 0; workPhase < ssa_gphaseCt; ++workPhase) { + int workItemCt = ssa_gphaseLen[workPhase]; // Communicate the updated velocities to all nodes comm->forward_comm_fix(this); @@ -585,12 +622,11 @@ void FixShardlow::initial_integrate(int vflag) memset(&(atom->uMech[nlocal]), 0, sizeof(double)*nghost); } - // process neighbors in this AIR - while (ct-- > 0) { - int 
len = list->numneigh[ii]; - if (useDPDE) ssa_update_dpde(ilist[ii], list->firstneigh[ii], len); - else ssa_update_dpd(ilist[ii], list->firstneigh[ii], len); - ii++; + for (int workItem = 0; workItem < workItemCt; ++workItem) { + int ct = ssa_gitemLen[workPhase][workItem]; + ii = ssa_gitemLoc[workPhase][workItem]; + if (useDPDE) ssa_update_dpde(ii, ct, workItem); + else ssa_update_dpd(ii, ct, workItem); } // Communicate the ghost deltas to the atom owners @@ -699,6 +735,7 @@ double FixShardlow::memory_usage() { double bytes = 0.0; bytes += sizeof(double)*3*atom->nghost; // v_t0[] + bytes += sizeof(*rand_state)*maxRNG; // rand_state[] return bytes; } diff --git a/src/USER-DPD/fix_shardlow.h b/src/USER-DPD/fix_shardlow.h index e8e5f484a0..21f7569a23 100644 --- a/src/USER-DPD/fix_shardlow.h +++ b/src/USER-DPD/fix_shardlow.h @@ -21,6 +21,8 @@ FixStyle(shardlow,FixShardlow) #define LMP_FIX_SHARDLOW_H #include "fix.h" +#include "random_external_state.h" +#include namespace LAMMPS_NS { @@ -52,12 +54,14 @@ class FixShardlow : public Fix { class PairDPDfdt *pairDPD; class PairDPDfdtEnergy *pairDPDE; double (*v_t0)[3]; + int maxRNG; private: + random_external_state::es_RNG_t *rand_state; double dtsqrt; // = sqrt(update->dt); - void ssa_update_dpd(int, int *, int); // Constant Temperature - void ssa_update_dpde(int, int *, int); // Constant Energy + void ssa_update_dpd(int, int, int); // Constant Temperature + void ssa_update_dpde(int, int, int); // Constant Energy }; diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.cpp b/src/USER-DPD/npair_half_bin_newton_ssa.cpp index a6479d4c4f..ce50f7603a 100644 --- a/src/USER-DPD/npair_half_bin_newton_ssa.cpp +++ b/src/USER-DPD/npair_half_bin_newton_ssa.cpp @@ -42,6 +42,10 @@ NPairHalfBinNewtonSSA::NPairHalfBinNewtonSSA(LAMMPS *lmp) : NPair(lmp) ssa_phaseLen = NULL; ssa_itemLoc = NULL; ssa_itemLen = NULL; + ssa_gphaseCt = 7; + memory->create(ssa_gphaseLen,ssa_gphaseCt,"NPairHalfBinNewtonSSA:ssa_gphaseLen"); + 
memory->create(ssa_gitemLoc,ssa_gphaseCt,1,"NPairHalfBinNewtonSSA:ssa_gitemLoc"); + memory->create(ssa_gitemLen,ssa_gphaseCt,1,"NPairHalfBinNewtonSSA:ssa_gitemLen"); } /* ---------------------------------------------------------------------- */ @@ -54,6 +58,10 @@ NPairHalfBinNewtonSSA::~NPairHalfBinNewtonSSA() memory->destroy(ssa_phaseLen); memory->destroy(ssa_itemLoc); memory->destroy(ssa_itemLen); + ssa_gphaseCt = 0; + memory->destroy(ssa_gphaseLen); + memory->destroy(ssa_gitemLoc); + memory->destroy(ssa_gitemLen); } /* ---------------------------------------------------------------------- @@ -236,13 +244,14 @@ void NPairHalfBinNewtonSSA::build(NeighList *list) if (ssa_phaseCt != workPhase) error->one(FLERR,"ssa_phaseCt was wrong"); - list->AIRct_ssa[0] = list->inum = inum; + list->inum = inum; // loop over AIR ghost atoms, storing their local neighbors // since these are ghosts, must check if stencil bin is out of bounds - for (int airnum = 1; airnum <= 7; airnum++) { + for (workPhase = 0; workPhase < ssa_gphaseCt; workPhase++) { int locAIRct = 0; - for (i = gairhead_ssa[airnum]; i >= 0; i = bins[i]) { + ssa_gitemLoc[workPhase][0] = inum + gnum; // record where workItem starts in ilist + for (i = gairhead_ssa[workPhase+1]; i >= 0; i = bins[i]) { n = 0; neighptr = ipage->vget(); @@ -305,7 +314,8 @@ void NPairHalfBinNewtonSSA::build(NeighList *list) if (ipage->status()) error->one(FLERR,"Neighbor (ghost) list overflow, boost neigh_modify one"); } - list->AIRct_ssa[airnum] = locAIRct; + ssa_gitemLen[workPhase][0] = locAIRct; + ssa_gphaseLen[workPhase] = 1; } list->gnum = gnum; } diff --git a/src/USER-DPD/npair_half_bin_newton_ssa.h b/src/USER-DPD/npair_half_bin_newton_ssa.h index ea292316ca..584d87e3ad 100644 --- a/src/USER-DPD/npair_half_bin_newton_ssa.h +++ b/src/USER-DPD/npair_half_bin_newton_ssa.h @@ -33,13 +33,18 @@ class NPairHalfBinNewtonSSA : public NPair { int *ssa_phaseLen; int **ssa_itemLoc; int **ssa_itemLen; + int ssa_gphaseCt; + int *ssa_gphaseLen; + 
int **ssa_gitemLoc; + int **ssa_gitemLen; + + int ssa_maxPhaseLen; NPairHalfBinNewtonSSA(class LAMMPS *); ~NPairHalfBinNewtonSSA(); void build(class NeighList *); private: int ssa_maxPhaseCt; - int ssa_maxPhaseLen; }; } diff --git a/src/USER-DPD/pair_dpd_fdt.h b/src/USER-DPD/pair_dpd_fdt.h index 5c20f2fc8f..84b09b0fa2 100644 --- a/src/USER-DPD/pair_dpd_fdt.h +++ b/src/USER-DPD/pair_dpd_fdt.h @@ -44,11 +44,11 @@ class PairDPDfdt : public Pair { double **sigma; double temperature; + int seed; class RanMars *random; protected: double cut_global; - int seed; bool splitFDT_flag; bool a0_is_zero; diff --git a/src/USER-DPD/random_external_state.h b/src/USER-DPD/random_external_state.h new file mode 100644 index 0000000000..d97d5a17ce --- /dev/null +++ b/src/USER-DPD/random_external_state.h @@ -0,0 +1,179 @@ +/* -*- c++ -*- ---------------------------------------------------------- + LAMMPS - Large-scale Atomic/Molecular Massively Parallel Simulator + http://lammps.sandia.gov, Sandia National Laboratories + Steve Plimpton, sjplimp@sandia.gov + + Copyright (2003) Sandia Corporation. Under the terms of Contract + DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains + certain rights in this software. This software is distributed under + the GNU General Public License. + + See the README file in the top-level LAMMPS directory. +------------------------------------------------------------------------- */ +/* + This random_external_state.h file was derived from the Kokkos + file algorithms/src/Kokkos_Random.hpp and adapted to work + without Kokkos installed, as well as being converted to a form + that has no internal state. All RNG state information is kept + outside this "class", and is passed in by reference by the caller. + */ +/* +//@HEADER +// ************************************************************************ +// +// Kokkos v. 
2.0 +// Copyright (2014) Sandia Corporation +// +// Under the terms of Contract DE-AC04-94AL85000 with Sandia Corporation, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY SANDIA CORPORATION "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL SANDIA CORPORATION OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact H. 
Carter Edwards (hcedwar@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef LMP_RANDOM_EXTERNALSTATE_H +#define LMP_RANDOM_EXTERNALSTATE_H + +#include <math.h> +#include "accelerator_kokkos.h" + +#ifdef LMP_KOKKOS +#define RND_INLINE KOKKOS_INLINE_FUNCTION +#else +#define RND_INLINE inline +#endif + + +/// \file random_external_state.h +/// \brief Pseudorandom number generators +/// +/// These generators are based on Vigna, Sebastiano (2014). "An +/// experimental exploration of Marsaglia's xorshift generators, +/// scrambled." See: http://arxiv.org/abs/1402.6246 + +// A replacement for the Kokkos Random_XorShift64 class that uses +// an external state variable, instead of a class member variable. +namespace random_external_state { + typedef uint64_t es_RNG_t; + + enum {MAX_URAND = 0xffffffffU}; + enum {MAX_URAND64 = 0xffffffffffffffffULL-1}; + + RND_INLINE + uint32_t es_urand(es_RNG_t &state_) { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + + es_RNG_t tmp = state_ * 2685821657736338717ULL; + tmp = tmp>>16; + return static_cast<uint32_t>(tmp&MAX_URAND); + } + + RND_INLINE + uint64_t es_urand64(es_RNG_t &state_) { + state_ ^= state_ >> 12; + state_ ^= state_ << 25; + state_ ^= state_ >> 27; + return (state_ * 2685821657736338717ULL) - 1; + } + + RND_INLINE + int es_rand(es_RNG_t &state_) { + return static_cast<int>(es_urand(state_)/2); + } + + RND_INLINE + double es_drand(es_RNG_t &state_) { + return 1.0 * es_urand64(state_)/MAX_URAND64; + } + + //Marsaglia polar method for drawing a standard normal distributed random number + RND_INLINE + double es_normal(es_RNG_t &state_) { + double S, U; + do { + U = 2.0*es_drand(state_) - 1.0; + const double V = 2.0*es_drand(state_) - 1.0; + S = U*U+V*V; + } while ((S >= 1.0) || (S == 0.0)); + return U*sqrt(-2.0*log(S)/S); + } + + RND_INLINE + double es_normalPair(es_RNG_t &state_, double &second) { + double S, U, V; + do { + U = 2.0*es_drand(state_) - 
1.0; + V = 2.0*es_drand(state_) - 1.0; + S = U*U+V*V; + } while ((S >= 1.0) || (S == 0.0)); + const double fac = sqrt(-2.0*log(S)/S); + second = V*fac; + return U*fac; + } + + // Use es_init() to init a serial RNG, that is then + // used to generate the initial state of your k parallel + // RNGs with k calls to genNextParallelState() + RND_INLINE + void es_init(es_RNG_t &serial_state, uint64_t seed) { + if(seed==0) seed = uint64_t(1318319); + serial_state = seed; + for(int i = 0; i < 17; i++) es_rand(serial_state); + } + + // Call genNextParallelState() once for each RNG to generate + // the initial state for that RNG. + RND_INLINE + void es_genNextParallelState(es_RNG_t &serial_state, es_RNG_t &new_state) { + int n1 = es_rand(serial_state); + int n2 = es_rand(serial_state); + int n3 = es_rand(serial_state); + int n4 = es_rand(serial_state); + new_state = ((((static_cast<es_RNG_t>(n1)) & 0xffff)<<00) | + (((static_cast<es_RNG_t>(n2)) & 0xffff)<<16) | + (((static_cast<es_RNG_t>(n3)) & 0xffff)<<32) | + (((static_cast<es_RNG_t>(n4)) & 0xffff)<<48)); + } +} + +#endif + +/* ERROR/WARNING messages: + +E: Invalid seed for Marsaglia random # generator + +The initial seed for this random number generator must be a positive +integer less than or equal to 900 million. 
+ +*/ diff --git a/src/USER-REAXC/fix_qeq_reax.h b/src/USER-REAXC/fix_qeq_reax.h index 19efcd2b03..96a174b908 100644 --- a/src/USER-REAXC/fix_qeq_reax.h +++ b/src/USER-REAXC/fix_qeq_reax.h @@ -122,15 +122,15 @@ class FixQEqReax : public Fix { //int GMRES(double*,double*); virtual void sparse_matvec(sparse_matrix*,double*,double*); - int pack_forward_comm(int, int *, double *, int, int *); - void unpack_forward_comm(int, int, double *); - int pack_reverse_comm(int, int, double *); - void unpack_reverse_comm(int, int *, double *); - double memory_usage(); - void grow_arrays(int); - void copy_arrays(int, int, int); - int pack_exchange(int, double *); - int unpack_exchange(int, double *); + virtual int pack_forward_comm(int, int *, double *, int, int *); + virtual void unpack_forward_comm(int, int, double *); + virtual int pack_reverse_comm(int, int, double *); + virtual void unpack_reverse_comm(int, int *, double *); + virtual double memory_usage(); + virtual void grow_arrays(int); + virtual void copy_arrays(int, int, int); + virtual int pack_exchange(int, double *); + virtual int unpack_exchange(int, double *); virtual double parallel_norm( double*, int ); virtual double parallel_dot( double*, double*, int ); diff --git a/src/neigh_list.cpp b/src/neigh_list.cpp index 934b9f7d9b..93f4b13bf2 100644 --- a/src/neigh_list.cpp +++ b/src/neigh_list.cpp @@ -88,7 +88,6 @@ NeighList::NeighList(LAMMPS *lmp) : Pointers(lmp) // USER-DPD package - for (int i = 0; i < 8; i++) AIRct_ssa[i] = 0; np = NULL; } diff --git a/src/neigh_list.h b/src/neigh_list.h index d633ba839e..755a1bf134 100644 --- a/src/neigh_list.h +++ b/src/neigh_list.h @@ -92,7 +92,6 @@ class NeighList : protected Pointers { // USER-DPD package and Shardlow Splitting Algorithm (SSA) support - int AIRct_ssa[8]; // count of how many atoms in each AIR class NPair *np; // ptr to NPair instance I depend on // methods