diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index c759181aa2..3ce38c37d8 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,5 +1,168 @@ # Change Log +## [3.4.00](https://github.com/kokkos/kokkos/tree/3.4.00) (2021-04-25) +[Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.01...3.4.00) + +**Highlights:** +- SYCL Backend Almost Feature Complete +- OpenMPTarget Backend Almost Feature Complete +- Performance Improvements for HIP backend +- Require CMake 3.16 or newer +- Tool Callback Interface Enhancements +- cmath wrapper functions available now in Kokkos::Experimental + +**Features:** +- Implement parallel_scan with ThreadVectorRange and Reducer [\#3861](https://github.com/kokkos/kokkos/pull/3861) +- Implement SYCL Random [\#3849](https://github.com/kokkos/kokkos/pull/3849) +- OpenMPTarget: Adding Implementation for nested reducers [\#3845](https://github.com/kokkos/kokkos/pull/3845) +- Implement UniqueToken for SYCL [\#3833](https://github.com/kokkos/kokkos/pull/3833) +- OpenMPTarget: UniqueToken::Global implementation [\#3823](https://github.com/kokkos/kokkos/pull/3823) +- DualView sync's on ExecutionSpaces [\#3822](https://github.com/kokkos/kokkos/pull/3822) +- SYCL outer TeamPolicy parallel_reduce [\#3818](https://github.com/kokkos/kokkos/pull/3818) +- SYCL TeamPolicy::team_scan [\#3815](https://github.com/kokkos/kokkos/pull/3815) +- SYCL MDRangePolicy parallel_reduce [\#3801](https://github.com/kokkos/kokkos/pull/3801) +- Enable use of execution space instances in ScatterView [\#3786](https://github.com/kokkos/kokkos/pull/3786) +- SYCL TeamPolicy nested parallel_reduce [\#3783](https://github.com/kokkos/kokkos/pull/3783) +- OpenMPTarget: MDRange with TagType for parallel_for [\#3781](https://github.com/kokkos/kokkos/pull/3781) +- Adding OpenMPTarget parallel_scan [\#3655](https://github.com/kokkos/kokkos/pull/3655) +- SYCL basic TeamPolicy [\#3654](https://github.com/kokkos/kokkos/pull/3654) +- OpenMPTarget: 
scratch memory implementation [\#3611](https://github.com/kokkos/kokkos/pull/3611) + +**Implemented enhancements Backends and Archs:** +- SYCL choose a specific GPU [\#3918](https://github.com/kokkos/kokkos/pull/3918) +- [HIP] Lock access to scratch memory when using Teams [\#3916](https://github.com/kokkos/kokkos/pull/3916) +- [HIP] fix multithreaded access to get_next_driver [\#3908](https://github.com/kokkos/kokkos/pull/3908) +- Forward declare HIPHostPinnedSpace and SYCLSharedUSMSpace [\#3902](https://github.com/kokkos/kokkos/pull/3902) +- Let SYCL USMObjectMem use SharedAllocationRecord [\#3898](https://github.com/kokkos/kokkos/pull/3898) +- Implement clock_tic for SYCL [\#3893](https://github.com/kokkos/kokkos/pull/3893) +- Don't use a static variable in HIPInternal::scratch_space [\#3866](https://github.com/kokkos/kokkos/pull/3866) +- Reuse memory for SYCL parallel_reduce [\#3873](https://github.com/kokkos/kokkos/pull/3873) +- Update SYCL compiler in CI [\#3826](https://github.com/kokkos/kokkos/pull/3826) +- Introduce HostSharedPtr to manage m_space_instance for Cuda/HIP/SYCL [\#3824](https://github.com/kokkos/kokkos/pull/3824) +- [HIP] Use shuffle for range reduction [\#3811](https://github.com/kokkos/kokkos/pull/3811) +- OpenMPTarget: Changes to the hierarchical parallelism [\#3808](https://github.com/kokkos/kokkos/pull/3808) +- Remove ExtendedReferenceWrapper for SYCL parallel_reduce [\#3802](https://github.com/kokkos/kokkos/pull/3802) +- Eliminate sycl_indirect_launch [\#3777](https://github.com/kokkos/kokkos/pull/3777) +- OpenMPTarget: scratch implementation for parallel_reduce [\#3776](https://github.com/kokkos/kokkos/pull/3776) +- Allow initializing SYCL execution space from sycl::queue and SYCL::impl_static_fence [\#3767](https://github.com/kokkos/kokkos/pull/3767) +- SYCL TeamPolicy scratch memory alternative [\#3763](https://github.com/kokkos/kokkos/pull/3763) +- Alternative implementation for SYCL
TeamPolicy [\#3759](https://github.com/kokkos/kokkos/pull/3759) +- Unify handling of synchronous errors in SYCL [\#3754](https://github.com/kokkos/kokkos/pull/3754) +- core/Cuda: Half_t updates for cgsolve [\#3746](https://github.com/kokkos/kokkos/pull/3746) +- Unify HIPParallelLaunch structures [\#3733](https://github.com/kokkos/kokkos/pull/3733) +- Improve performance for SYCL parallel_reduce [\#3732](https://github.com/kokkos/kokkos/pull/3732) +- Use consistent types in Kokkos_OpenMPTarget_Parallel.hpp [\#3703](https://github.com/kokkos/kokkos/pull/3703) +- Implement non-blocking kernel launches for HIP backend [\#3697](https://github.com/kokkos/kokkos/pull/3697) +- Change SYCLInternal::m_queue std::unique_ptr -> std::optional [\#3677](https://github.com/kokkos/kokkos/pull/3677) +- Use alternative SYCL parallel_reduce implementation [\#3671](https://github.com/kokkos/kokkos/pull/3671) +- Use runtime values in KokkosExp_MDRangePolicy.hpp [\#3626](https://github.com/kokkos/kokkos/pull/3626) +- Clean up AnalyzePolicy [\#3564](https://github.com/kokkos/kokkos/pull/3564) +- Changes for indirect launch of SYCL parallel reduce [\#3511](https://github.com/kokkos/kokkos/pull/3511) + +**Implemented enhancements BuildSystem:** +- Also require C++14 when building gtest [\#3912](https://github.com/kokkos/kokkos/pull/3912) +- Fix compiling SYCL with OpenMP [\#3874](https://github.com/kokkos/kokkos/pull/3874) +- Require C++17 for SYCL (at configuration time) [\#3869](https://github.com/kokkos/kokkos/pull/3869) +- Add COMPILE_DEFINITIONS argument to kokkos_create_imported_tpl [\#3862](https://github.com/kokkos/kokkos/pull/3862) +- Do not pass arch flags to the linker with no rdc [\#3846](https://github.com/kokkos/kokkos/pull/3846) +- Try compiling C++14 check with C++14 support and print error message [\#3843](https://github.com/kokkos/kokkos/pull/3843) +- Enable HIP with Cray Clang [\#3842](https://github.com/kokkos/kokkos/pull/3842) +- Add an option to disable header self 
containment tests [\#3834](https://github.com/kokkos/kokkos/pull/3834) +- CMake check for C++14 [\#3809](https://github.com/kokkos/kokkos/pull/3809) +- Prefer -std=* over --std=* [\#3779](https://github.com/kokkos/kokkos/pull/3779) +- Kokkos launch compiler updates [\#3778](https://github.com/kokkos/kokkos/pull/3778) +- Updated comments and enabled no-op for kokkos_launch_compiler [\#3774](https://github.com/kokkos/kokkos/pull/3774) +- Apple's Clang not correctly recognised [\#3772](https://github.com/kokkos/kokkos/pull/3772) +- kokkos_launch_compiler + CUDA auto-detect arch [\#3770](https://github.com/kokkos/kokkos/pull/3770) +- Add Spack test support for Kokkos [\#3753](https://github.com/kokkos/kokkos/pull/3753) +- Split SYCL tests for aot compilation [\#3741](https://github.com/kokkos/kokkos/pull/3741) +- Use consistent OpenMP flag for IntelClang [\#3735](https://github.com/kokkos/kokkos/pull/3735) +- Add support for -Wno-deprecated-gpu-targets [\#3722](https://github.com/kokkos/kokkos/pull/3722) +- Add configuration to target CUDA compute capability 8.6 [\#3713](https://github.com/kokkos/kokkos/pull/3713) +- Added VERSION and SOVERSION to KOKKOS_INTERNAL_ADD_LIBRARY [\#3706](https://github.com/kokkos/kokkos/pull/3706) +- Add fast-math to known NVCC flags [\#3699](https://github.com/kokkos/kokkos/pull/3699) +- Add MI-100 arch string [\#3698](https://github.com/kokkos/kokkos/pull/3698) +- Require CMake >=3.16 [\#3679](https://github.com/kokkos/kokkos/pull/3679) +- KokkosCI.cmake, KokkosCTest.cmake.in, CTestConfig.cmake.in + CI updates [\#2844](https://github.com/kokkos/kokkos/pull/2844) + +**Implemented enhancements Tools:** +- Improve readability of the callback invocation in profiling [\#3860](https://github.com/kokkos/kokkos/pull/3860) +- V1.1 Tools Interface: incremental, action-based [\#3812](https://github.com/kokkos/kokkos/pull/3812) +- Enable launch latency simulations [\#3721](https://github.com/kokkos/kokkos/pull/3721) +- Added metadata callback to 
tools interface [\#3711](https://github.com/kokkos/kokkos/pull/3711) +- MDRange Tile Size Tuning [\#3688](https://github.com/kokkos/kokkos/pull/3688) +- Added support for command-line args for kokkos-tools [\#3627](https://github.com/kokkos/kokkos/pull/3627) +- Query max tile sizes for an MDRangePolicy, and set tile sizes on an existing policy [\#3481](https://github.com/kokkos/kokkos/pull/3481) + +**Implemented enhancements Other:** +- Try detecting ndevices in get_gpu [\#3921](https://github.com/kokkos/kokkos/pull/3921) +- Use strcmp to compare names() [\#3909](https://github.com/kokkos/kokkos/pull/3909) +- Add execution space arguments for constructor overloads that might allocate a new underlying View [\#3904](https://github.com/kokkos/kokkos/pull/3904) +- Prefix labels in internal use of kokkos_malloc [\#3891](https://github.com/kokkos/kokkos/pull/3891) +- Prefix labels for internal uses of SharedAllocationRecord [\#3890](https://github.com/kokkos/kokkos/pull/3890) +- Add missing hypot math function [\#3880](https://github.com/kokkos/kokkos/pull/3880) +- Unify algorithm unit tests to avoid code duplication [\#3851](https://github.com/kokkos/kokkos/pull/3851) +- DualView.template view() better matches for Devices in UVMSpace cases [\#3857](https://github.com/kokkos/kokkos/pull/3857) +- More extensive disentangling of Policy Traits [\#3829](https://github.com/kokkos/kokkos/pull/3829) +- Replaced nanosleep and sched_yield with STL routines [\#3825](https://github.com/kokkos/kokkos/pull/3825) +- Constructing Atomic Subviews [\#3810](https://github.com/kokkos/kokkos/pull/3810) +- Metadata Declaration in Core [\#3729](https://github.com/kokkos/kokkos/pull/3729) +- Allow using tagged final functor in parallel_reduce [\#3714](https://github.com/kokkos/kokkos/pull/3714) +- Major duplicate code removal in SharedAllocationRecord specializations [\#3658](https://github.com/kokkos/kokkos/pull/3658) + +**Fixed bugs:** +- Provide forward declarations in 
Kokkos_ViewLayoutTiled.hpp for XL [\#3911](https://github.com/kokkos/kokkos/pull/3911) +- Fixup absolute value of floating points in Kokkos complex [\#3882](https://github.com/kokkos/kokkos/pull/3882) +- Address intel 17 ICE [\#3881](https://github.com/kokkos/kokkos/pull/3881) +- Add missing pow(Kokkos::complex) overloads [\#3868](https://github.com/kokkos/kokkos/pull/3868) +- Fix bug {pow, log}(Kokkos::complex) [\#3866](https://github.com/kokkos/kokkos/pull/3866) +- Cleanup writing to output streams in Cuda [\#3859](https://github.com/kokkos/kokkos/pull/3859) +- Fixup cache CUDA fallback execution space instance used by DualView::sync [\#3856](https://github.com/kokkos/kokkos/pull/3856) +- Fix cmake warning with pthread [\#3854](https://github.com/kokkos/kokkos/pull/3854) +- Fix typo FOUND_CUDA_{DRIVVER -> DRIVER} [\#3852](https://github.com/kokkos/kokkos/pull/3852) +- Fix bug in SYCL team_reduce [\#3848](https://github.com/kokkos/kokkos/pull/3848) +- Atrocious bug in MDRange tuning [\#3803](https://github.com/kokkos/kokkos/pull/3803) +- Fix compiling SYCL with Kokkos_ENABLE_TUNING=ON [\#3800](https://github.com/kokkos/kokkos/pull/3800) +- Fixed command line parsing bug [\#3797](https://github.com/kokkos/kokkos/pull/3797) +- Workaround race condition in SYCL parallel_reduce [\#3782](https://github.com/kokkos/kokkos/pull/3782) +- Fix Atomic{Min,Max} for Kepler30 [\#3780](https://github.com/kokkos/kokkos/pull/3780) +- Fix SYCL typo [\#3755](https://github.com/kokkos/kokkos/pull/3755) +- Fixed Kokkos_install_additional_files macro [\#3752](https://github.com/kokkos/kokkos/pull/3752) +- Fix a typo for Kokkos_ARCH_A64FX [\#3751](https://github.com/kokkos/kokkos/pull/3751) +- OpenMPTarget: fixes and workarounds to work with "Release" build type [\#3748](https://github.com/kokkos/kokkos/pull/3748) +- Fix parsing bug for number of devices command line argument [\#3724](https://github.com/kokkos/kokkos/pull/3724) +- Avoid more
warnings with clang and C++20 [\#3719](https://github.com/kokkos/kokkos/pull/3719) +- Fix gcc-10.1 C++20 warnings [\#3718](https://github.com/kokkos/kokkos/pull/3718) +- Fix cuda cache config not being set correct [\#3712](https://github.com/kokkos/kokkos/pull/3712) +- Fix dualview deepcopy perftools [\#3701](https://github.com/kokkos/kokkos/pull/3701) +- use drand instead of frand in drand [\#3696](https://github.com/kokkos/kokkos/pull/3696) + +**Incompatibilities:** +- Remove unimplemented member functions of SYCLDevice [\#3919](https://github.com/kokkos/kokkos/pull/3919) +- Replace cl::sycl [\#3896](https://github.com/kokkos/kokkos/pull/3896) +- Get rid of SYCL workaround in Kokkos_Complex.hpp [\#3884](https://github.com/kokkos/kokkos/pull/3884) +- Replace most uses of if_c [\#3883](https://github.com/kokkos/kokkos/pull/3883) +- Remove Impl::enable_if_type [\#3863](https://github.com/kokkos/kokkos/pull/3863) +- Remove HostBarrier test [\#3847](https://github.com/kokkos/kokkos/pull/3847) +- Avoid (void) interface [\#3836](https://github.com/kokkos/kokkos/pull/3836) +- Remove VerifyExecutionCanAccessMemorySpace [\#3813](https://github.com/kokkos/kokkos/pull/3813) +- Avoid duplicated code in ScratchMemorySpace [\#3793](https://github.com/kokkos/kokkos/pull/3793) +- Remove superfluous FunctorFinal specialization [\#3788](https://github.com/kokkos/kokkos/pull/3788) +- Rename cl::sycl -> sycl in Kokkos_MathematicalFunctions.hpp [\#3678](https://github.com/kokkos/kokkos/pull/3678) +- Remove integer_sequence backward compatibility implementation [\#3533](https://github.com/kokkos/kokkos/pull/3533) + +**Enabled tests:** +- Fixup re-enable core performance tests [\#3903](https://github.com/kokkos/kokkos/pull/3903) +- Enable more SYCL tests [\#3900](https://github.com/kokkos/kokkos/pull/3900) +- Restrict MDRange Policy tests for Intel GPUs [\#3853](https://github.com/kokkos/kokkos/pull/3853) +- Disable death tests for rawhide 
[\#3844](https://github.com/kokkos/kokkos/pull/3844) +- OpenMPTarget: Block unit tests that do not pass with the nvidia compiler [\#3839](https://github.com/kokkos/kokkos/pull/3839) +- Enable Bitset container test for SYCL [\#3830](https://github.com/kokkos/kokkos/pull/3830) +- Enable some more SYCL tests [\#3744](https://github.com/kokkos/kokkos/pull/3744) +- Enable SYCL atomic tests [\#3742](https://github.com/kokkos/kokkos/pull/3742) +- Enable more SYCL perf_tests [\#3692](https://github.com/kokkos/kokkos/pull/3692) +- Enable examples for SYCL [\#3691](https://github.com/kokkos/kokkos/pull/3691) + ## [3.3.01](https://github.com/kokkos/kokkos/tree/3.3.01) (2021-01-06) [Full Changelog](https://github.com/kokkos/kokkos/compare/3.3.00...3.3.01) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 7bc3c77256..6fc1bf7d2f 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -72,7 +72,7 @@ ENDFUNCTION() LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) IF(NOT KOKKOS_HAS_TRILINOS) - cmake_minimum_required(VERSION 3.10 FATAL_ERROR) + cmake_minimum_required(VERSION 3.16 FATAL_ERROR) set(CMAKE_DISABLE_SOURCE_CHANGES ON) set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) IF (Spack_WORKAROUND) @@ -111,27 +111,25 @@ ENDIF() set(Kokkos_VERSION_MAJOR 3) -set(Kokkos_VERSION_MINOR 3) -set(Kokkos_VERSION_PATCH 1) +set(Kokkos_VERSION_MINOR 4) +set(Kokkos_VERSION_PATCH 00) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") math(EXPR KOKKOS_VERSION "${Kokkos_VERSION_MAJOR} * 10000 + ${Kokkos_VERSION_MINOR} * 100 + ${Kokkos_VERSION_PATCH}") -IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") - CMAKE_POLICY(SET CMP0074 NEW) -ENDIF() +MESSAGE(STATUS "Setting policy CMP0074 to use _ROOT variables") +CMAKE_POLICY(SET CMP0074 NEW) # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) 
INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0") - #If we are building CUDA, we have tricked CMake because we declare a CXX project - #If the default C++ standard for a given compiler matches the requested - #standard, then CMake just omits the -std flag in later versions of CMake - #This breaks CUDA compilation (CUDA compiler can have a different default - #-std then the underlying host compiler by itself). Setting this variable - #forces CMake to always add the -std flag even if it thinks it doesn't need it +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler can have a different default + # -std than the underlying host compiler by itself). Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) ENDIF() @@ -139,15 +137,19 @@ ENDIF() # I really wish these were regular variables # but scoping issues can make it difficult GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +GLOBAL_SET(KOKKOS_LINK_OPTIONS) GLOBAL_SET(KOKKOS_CUDA_OPTIONS) GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos GLOBAL_SET(KOKKOS_TPL_EXPORTS) -# this could probably be scoped to project +# KOKKOS_DEPENDENCE is used by kokkos_launch_compiler GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +# MSVC never goes through kokkos_launch_compiler +IF(NOT MSVC) + GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +ENDIF() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or
TriBITS diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 061b7a46ee..aa97f99b75 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -11,8 +11,8 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 3 -KOKKOS_VERSION_MINOR = 3 -KOKKOS_VERSION_PATCH = 1 +KOKKOS_VERSION_MINOR = 4 +KOKKOS_VERSION_PATCH = 00 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) # Options: Cuda,HIP,OpenMP,Pthread,Serial @@ -20,7 +20,7 @@ KOKKOS_DEVICES ?= "OpenMP" #KOKKOS_DEVICES ?= "Pthread" # Options: # Intel: KNC,KNL,SNB,HSW,BDW,SKX -# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80 +# NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX # IBM: BGQ,Power7,Power8,Power9 # AMD-GPUS: Vega900,Vega906,Vega908 @@ -164,17 +164,17 @@ KOKKOS_INTERNAL_OS_DARWIN := $(call kokkos_has_string,$(KOKKOS_OS),Darwin) KOKKOS_CXX_VERSION := $(strip $(shell $(CXX) --version 2>&1)) KOKKOS_INTERNAL_COMPILER_INTEL := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Intel Corporation) KOKKOS_INTERNAL_COMPILER_PGI := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),PGI) -KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep XL | wc -l)) -KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep "CC-" | wc -l)) -KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep nvcc | wc -l)>0" | bc)) +KOKKOS_INTERNAL_COMPILER_XL := $(strip $(shell $(CXX) -qversion 2>&1 | grep -c XL)) +KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "CC-")) +KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo 
"$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple LLVM) +KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) KOKKOS_INTERNAL_COMPILER_GCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),GCC) # Check Host Compiler if using NVCC through nvcc_wrapper ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC), 1) - KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep nvcc_wrapper | wc -l)) + KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER := $(strip $(shell echo $(CXX) | grep -c nvcc_wrapper)) ifeq ($(KOKKOS_INTERNAL_COMPILER_NVCC_WRAPPER), 1) KOKKOS_CXX_HOST_VERSION := $(strip $(shell $(CXX) $(CXXFLAGS) --host-version 2>&1)) @@ -297,11 +297,11 @@ else #KOKKOS_INTERNAL_CXX1Z_FLAG := -hstd=c++1z #KOKKOS_INTERNAL_CXX2A_FLAG := -hstd=c++2a else - KOKKOS_INTERNAL_CXX14_FLAG := --std=c++14 - KOKKOS_INTERNAL_CXX1Y_FLAG := --std=c++1y - KOKKOS_INTERNAL_CXX17_FLAG := --std=c++17 - KOKKOS_INTERNAL_CXX1Z_FLAG := --std=c++1z - KOKKOS_INTERNAL_CXX2A_FLAG := --std=c++2a + KOKKOS_INTERNAL_CXX14_FLAG := -std=c++14 + KOKKOS_INTERNAL_CXX1Y_FLAG := -std=c++1y + KOKKOS_INTERNAL_CXX17_FLAG := -std=c++17 + KOKKOS_INTERNAL_CXX1Z_FLAG := -std=c++1z + KOKKOS_INTERNAL_CXX2A_FLAG := -std=c++2a endif endif endif @@ -332,6 +332,7 @@ KOKKOS_INTERNAL_USE_ARCH_VOLTA70 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volt KOKKOS_INTERNAL_USE_ARCH_VOLTA72 := $(call kokkos_has_string,$(KOKKOS_ARCH),Volta72) KOKKOS_INTERNAL_USE_ARCH_TURING75 := $(call kokkos_has_string,$(KOKKOS_ARCH),Turing75) KOKKOS_INTERNAL_USE_ARCH_AMPERE80 := $(call kokkos_has_string,$(KOKKOS_ARCH),Ampere80) +KOKKOS_INTERNAL_USE_ARCH_AMPERE86 := $(call 
kokkos_has_string,$(KOKKOS_ARCH),Ampere86) KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLER30) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER32) \ + $(KOKKOS_INTERNAL_USE_ARCH_KEPLER35) \ @@ -344,7 +345,8 @@ KOKKOS_INTERNAL_USE_ARCH_NVIDIA := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_KEPLE + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA70) \ + $(KOKKOS_INTERNAL_USE_ARCH_VOLTA72) \ + $(KOKKOS_INTERNAL_USE_ARCH_TURING75) \ - + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80)) + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE80) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMPERE86)) #SEK: This seems like a bug to me ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) @@ -585,10 +587,10 @@ ifeq ($(KOKKOS_INTERNAL_ENABLE_PROFILING_LOAD_PRINT), 1) endif ifeq ($(KOKKOS_INTERNAL_ENABLE_TUNING), 1) - tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_TUNING") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_TUNING") endif -tmp := $(call kokkos_append_header,"\#define KOKKOS_ENABLE_LIBDL") +tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_LIBDL") ifeq ($(KOKKOS_INTERNAL_USE_HWLOC), 1) ifneq ($(KOKKOS_CMAKE), yes) @@ -752,6 +754,14 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_A64FX), 1) KOKKOS_CXXFLAGS += -march=armv8.2-a+sve KOKKOS_LDFLAGS += -march=armv8.2-a+sve + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_GCC), 1) + KOKKOS_CXXFLAGS += -msve-vector-bits=512 + KOKKOS_LDFLAGS += -msve-vector-bits=512 + endif endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ZEN), 1) @@ -1100,6 +1110,11 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA_ARCH), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + 
KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 + endif ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) @@ -1159,7 +1174,7 @@ endif KOKKOS_INTERNAL_LS_CONFIG := $(shell ls KokkosCore_config.h 2>&1) ifeq ($(KOKKOS_INTERNAL_LS_CONFIG), KokkosCore_config.h) - KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep define | wc -l)) + KOKKOS_INTERNAL_NEW_CONFIG := $(strip $(shell diff KokkosCore_config.h KokkosCore_config.tmp | grep -c define)) else KOKKOS_INTERNAL_NEW_CONFIG := 1 endif @@ -1181,41 +1196,41 @@ tmp := $(call kokkos_update_config_header, KOKKOS_SETUP_HPP_, "KokkosCore_Config tmp := $(call kokkos_update_config_header, KOKKOS_DECLARE_HPP_, "KokkosCore_Config_DeclareBackend.tmp", "KokkosCore_Config_DeclareBackend.hpp") tmp := $(call kokkos_update_config_header, KOKKOS_POST_INCLUDE_HPP_, "KokkosCore_Config_PostInclude.tmp", "KokkosCore_Config_PostInclude.hpp") ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") ifeq ($(KOKKOS_INTERNAL_CUDA_USE_UVM), 1) else endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include 
","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_SetupBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_SetupBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call 
kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif ifeq ($(KOKKOS_INTERNAL_USE_MEMKIND), 1) - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_FwdBackend.hpp") - tmp := $(call kokkos_append_config_header,"\#include ","KokkosCore_Config_DeclareBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_FwdBackend.hpp") + tmp := $(call kokkos_append_config_header,"$H""include ","KokkosCore_Config_DeclareBackend.hpp") endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) @@ -1334,7 +1349,7 @@ ifneq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) endif # With Cygwin functions such as fdopen and fileno are not defined -# when strict ansi is enabled. strict ansi gets enabled with --std=c++14 +# when strict ansi is enabled. strict ansi gets enabled with -std=c++14 # though. So we hard undefine it here. Not sure if that has any bad side effects # This is needed for gtest actually, not for Kokkos itself! 
ifeq ($(KOKKOS_INTERNAL_OS_CYGWIN), 1) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index 5a03f7d17e..cf9fc24242 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -36,6 +36,8 @@ Kokkos_MemorySpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_ $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_MemorySpace.cpp Kokkos_HostSpace_deepcopy.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace_deepcopy.cpp +Kokkos_NumericTraits.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_NumericTraits.cpp ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 69d6cf8f35..904cf5ccb9 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -668,6 +668,25 @@ struct Random_UniqueIndex { }; #endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct Random_UniqueIndex { + using locks_view_type = View; + KOKKOS_FUNCTION + static int get_state_idx(const locks_view_type& locks_) { +#ifdef KOKKOS_ARCH_INTEL_GEN + int i = Kokkos::Impl::clock_tic() % locks_.extent(0); +#else + int i = 0; +#endif + while (Kokkos::atomic_compare_exchange(&locks_(i), 0, 1)) { + i = (i + 1) % static_cast(locks_.extent(0)); + } + return i; + } +}; +#endif + } // namespace Impl template @@ -1028,7 +1047,7 @@ class Random_XorShift1024 { KOKKOS_INLINE_FUNCTION double drand(const double& start, const double& end) { - return frand(end - start) + start; + return drand(end - start) + start; } // Marsaglia polar 
method for drawing a standard normal distributed random diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index 819c9e54ba..9109837985 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -3,6 +3,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) SET(GTEST_SOURCE_DIR ${${PARENT_PACKAGE_NAME}_SOURCE_DIR}/tpls/gtest) @@ -25,7 +26,7 @@ KOKKOS_ADD_TEST_LIBRARY( TARGET_COMPILE_DEFINITIONS(kokkosalgorithms_gtest PUBLIC GTEST_HAS_TR1_TUPLE=0 GTEST_HAS_PTHREAD=0) IF((NOT (Kokkos_ENABLE_CUDA AND WIN32)) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) -TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_11) + TARGET_COMPILE_FEATURES(kokkosalgorithms_gtest PUBLIC cxx_std_14) ENDIF() # Suppress clang-tidy diagnostics on code that we do not have control over @@ -33,51 +34,42 @@ IF(CMAKE_CXX_CLANG_TIDY) SET_TARGET_PROPERTIES(kokkosalgorithms_gtest PROPERTIES CXX_CLANG_TIDY "") ENDIF() -SET(SOURCES - UnitTestMain.cpp -) +SET(ALGORITHM UnitTestMain.cpp) IF(Kokkos_ENABLE_OPENMP) - LIST( APPEND SOURCES - TestOpenMP.cpp + LIST(APPEND ALGORITHM_SOURCES TestOpenMP_Sort1D.cpp TestOpenMP_Sort3D.cpp TestOpenMP_SortDynamicView.cpp - TestOpenMP_Random.cpp ) ENDIF() -IF(Kokkos_ENABLE_HIP) - LIST( APPEND SOURCES - TestHIP.cpp - ) -ENDIF() +foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL) + # Because there is always an exception to the rule + if(Tag STREQUAL "Threads") + set(DEVICE "PTHREAD") + else() + string(TOUPPER ${Tag} DEVICE) + endif() -IF(Kokkos_ENABLE_CUDA) - LIST( APPEND SOURCES - TestCuda.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_HPX) - LIST( APPEND SOURCES - TestHPX.cpp - ) -ENDIF() - 
-IF(Kokkos_ENABLE_SERIAL) - LIST( APPEND SOURCES - TestSerial.cpp - ) -ENDIF() - -IF(Kokkos_ENABLE_PTHREAD) - LIST( APPEND SOURCES - TestThreads.cpp - ) -ENDIF() + if(Kokkos_ENABLE_${DEVICE}) + set(dir ${CMAKE_CURRENT_BINARY_DIR}) + set(file ${dir}/Test${Tag}.cpp) + # Write to a temporary intermediate file and call configure_file to avoid + # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. + file(WRITE ${dir}/dummy.cpp + "#include \n" + "#include \n" + "#include \n" + ) + configure_file(${dir}/dummy.cpp ${file}) + list(APPEND ALGORITHM_SOURCES ${file}) + endif() +endforeach() KOKKOS_ADD_EXECUTABLE_AND_TEST( UnitTest - SOURCES ${SOURCES} + SOURCES + UnitTestMain.cpp + ${ALGORITHM_SOURCES} ) diff --git a/lib/kokkos/algorithms/unit_tests/Makefile b/lib/kokkos/algorithms/unit_tests/Makefile index c112d7c6fc..dd0aa87de0 100644 --- a/lib/kokkos/algorithms/unit_tests/Makefile +++ b/lib/kokkos/algorithms/unit_tests/Makefile @@ -20,11 +20,19 @@ override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/algorithms/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files TEST_TARGETS = TARGETS = +tmp := $(foreach device, $(KOKKOS_DEVICELIST), \ + $(if $(filter Test$(device).cpp, $(shell ls Test$(device).cpp 2>/dev/null)),,\ + $(shell echo "\#include " > Test$(device).cpp); \ + $(shell echo "\#include " >> Test$(device).cpp); \ + $(shell echo "\#include " >> Test$(device).cpp); \ + ) \ +) + ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) OBJ_CUDA = TestCuda.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_Cuda @@ -44,7 +52,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_PTHREADS), 1) endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) - OBJ_OPENMP = TestOpenMP.o TestOpenMP_Random.o TestOpenMP_Sort1D.o TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o + OBJ_OPENMP = TestOpenMP.o TestOpenMP_Sort1D.o 
TestOpenMP_Sort3D.o TestOpenMP_SortDynamicView.o UnitTestMain.o gtest-all.o TARGETS += KokkosAlgorithms_UnitTest_OpenMP TEST_TARGETS += test-openmp endif diff --git a/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp b/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp index a9b2010ad0..4a5839f0c8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestOpenMP_Sort1D.cpp @@ -59,6 +59,8 @@ TEST(openmp, SortUnsigned1D) { Impl::test_1D_sort(171); } +TEST(openmp, SortIssue1160) { Impl::test_issue_1160_sort(); } + } // namespace Test #else void KOKKOS_ALGORITHMS_UNITTESTS_TESTOPENMP_PREVENT_LINK_ERROR() {} diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index caba92c152..1f14875096 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -491,6 +491,34 @@ void test_random(unsigned int num_draws) { } } // namespace Impl +template +void test_random_xorshift64() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 132141141; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10240000; +#endif + Impl::test_random>(num_draws); + Impl::test_random>>( + num_draws); +} + +template +void test_random_xorshift1024() { +#if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ + defined(KOKKOS_ENABLE_HIP) + const int num_draws = 52428813; +#else // SERIAL, HPX, OPENMP + const int num_draws = 10130144; +#endif + Impl::test_random>( + num_draws); + Impl::test_random>>( + num_draws); +} } // namespace Test #endif // KOKKOS_TEST_UNORDERED_MAP_HPP diff --git a/lib/kokkos/algorithms/unit_tests/TestRandomCommon.hpp b/lib/kokkos/algorithms/unit_tests/TestRandomCommon.hpp new file mode 100644 index 0000000000..c6d3b59ae1 --- /dev/null +++ b/lib/kokkos/algorithms/unit_tests/TestRandomCommon.hpp @@ -0,0 +1,60 @@ +/* +//@HEADER 
+// ************************************************************************ +// +// Kokkos v. 3.0 +// Copyright (2020) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// 1. Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// +// 2. Redistributions in binary form must reproduce the above copyright +// notice, this list of conditions and the following disclaimer in the +// documentation and/or other materials provided with the distribution. +// +// 3. Neither the name of the Corporation nor the names of the +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY +// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE +// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) +// +// ************************************************************************ +//@HEADER +*/ + +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTRANDOM_COMMON_HPP + +#include + +namespace Test { + +TEST(TEST_CATEGORY, Random_XorShift64) { + test_random_xorshift64(); +} +TEST(TEST_CATEGORY, Random_XorShift1024_0) { + test_random_xorshift1024(); +} +} // namespace Test + +#endif diff --git a/lib/kokkos/containers/unit_tests/TestCuda_Category.hpp b/lib/kokkos/algorithms/unit_tests/TestSortCommon.hpp similarity index 88% rename from lib/kokkos/containers/unit_tests/TestCuda_Category.hpp rename to lib/kokkos/algorithms/unit_tests/TestSortCommon.hpp index 50935d7a34..56657b6574 100644 --- a/lib/kokkos/containers/unit_tests/TestCuda_Category.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSortCommon.hpp @@ -42,10 +42,14 @@ //@HEADER */ -#ifndef KOKKOS_TEST_CUDA_HPP -#define KOKKOS_TEST_CUDA_HPP +#ifndef KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP +#define KOKKOS_ALGORITHMS_UNITTESTS_TESTSORT_COMMON_HPP -#define TEST_CATEGORY cuda -#define TEST_EXECSPACE Kokkos::Cuda +#include +namespace Test { +TEST(TEST_CATEGORY, SortUnsigned) { + Impl::test_sort(171); +} +} // namespace Test #endif diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml index c40bf066b7..e8763c0b66 100644 --- a/lib/kokkos/appveyor.yml +++ b/lib/kokkos/appveyor.yml @@ -3,8 +3,4 @@ image: clone_folder: c:\projects\source build_script: - cmd: >- - mkdir build && - cd build && - cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON && - cmake --build . 
--target install && - ctest -C Debug -V + cmake c:\projects\source -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc /d1reportClassLayoutChanges" -DCTEST_ARGS="-C Debug -V --output-on-failure" -DBUILD_NAME=MSVC-2019 -DBUILD_TYPE=Debug -DSITE=AppVeyor -DTARGET=install -P cmake/KokkosCI.cmake diff --git a/lib/kokkos/bin/kokkos_launch_compiler b/lib/kokkos/bin/kokkos_launch_compiler index 1fbebf648f..d929d24f1d 100755 --- a/lib/kokkos/bin/kokkos_launch_compiler +++ b/lib/kokkos/bin/kokkos_launch_compiler @@ -13,6 +13,17 @@ # $1 are 'ar', 'cmake', etc. during the linking phase # +# emit a message about the underlying command executed +: ${DEBUG:=0} +: ${KOKKOS_DEBUG_LAUNCH_COMPILER:=${DEBUG}} + +debug-message() +{ + if [ "${KOKKOS_DEBUG_LAUNCH_COMPILER}" -ne 0 ]; then + echo -e "##### $(basename ${BASH_SOURCE[0]}) executing: \"$@\"... #####" + fi +} + # check the arguments for the KOKKOS_DEPENDENCE compiler definition KOKKOS_DEPENDENCE=0 for i in ${@} @@ -23,16 +34,30 @@ do fi done -# if C++ is not passed, someone is probably trying to invoke it directly +# if Kokkos compiler is not passed, someone is probably trying to invoke it directly if [ -z "${1}" ]; then - echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the first argument." + echo -e "\n${BASH_SOURCE[0]} was invoked without the Kokkos compiler as the first argument." echo "This script is not indended to be directly invoked by any mechanism other" - echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake\n" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" + exit 1 +fi + +# if C++ compiler is not passed, someone is probably trying to invoke it directly +if [ -z "${2}" ]; then + echo -e "\n${BASH_SOURCE[0]} was invoked without the C++ compiler as the second argument." 
+ echo "This script is not indended to be directly invoked by any mechanism other" + echo -e "than through a RULE_LAUNCH_COMPILE or RULE_LAUNCH_LINK property set in CMake.\n" exit 1 fi # if there aren't two args, this isn't necessarily invalid, just a bit strange -if [ -z "${2}" ]; then exit 0; fi +if [ -z "${3}" ]; then exit 0; fi + +# store the Kokkos compiler +KOKKOS_COMPILER=${1} + +# remove the Kokkos compiler from the arguments +shift # store the expected C++ compiler CXX_COMPILER=${1} @@ -40,48 +65,57 @@ CXX_COMPILER=${1} # remove the expected C++ compiler from the arguments shift -# after the above shift, $1 is now the exe for the compile or link command, e.g. -# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# NOTE: in below, ${KOKKOS_COMPILER} is usually nvcc_wrapper +# +# after the above shifts, $1 is now the exe for the compile or link command, e.g. +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o # becomes: # kokkos_launch_compiler gcc -c file.c -o file.o -# Check to see if the executable is the C++ compiler and if it is not, then +# We check to see if the executable is the C++ compiler and if it is not, then # just execute the command. 
# # Summary: -# kokkos_launch_compiler g++ gcc -c file.c -o file.o +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ gcc -c file.c -o file.o # results in this command being executed: # gcc -c file.c -o file.o # and -# kokkos_launch_compiler g++ g++ -c file.cpp -o file.o +# kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this command being executed: -# nvcc_wrapper -c file.cpp -o file.o +# ${KOKKOS_COMPILER} -c file.cpp -o file.o if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then - # the command does not depend on Kokkos so just execute the command w/o re-directing to nvcc_wrapper + debug-message $@ + # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} eval $@ else - # the executable is the C++ compiler, so we need to re-direct to nvcc_wrapper - - # find the nvcc_wrapper from the same build/install - NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" - - if [ -z "${NVCC_WRAPPER}" ]; then - echo -e "\nError: nvcc_wrapper not found in $(dirname ${BASH_SOURCE[0]}).\n" + # the executable is the C++ compiler, so we need to re-direct to ${KOKKOS_COMPILER} + if [ ! 
-f "${KOKKOS_COMPILER}" ]; then + echo -e "\nError: the compiler redirect for Kokkos was not found at ${KOKKOS_COMPILER}\n" exit 1 fi - # set default nvcc wrapper compiler if not specified - : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} - export NVCC_WRAPPER_DEFAULT_COMPILER + # find the nvcc_wrapper from the same build/install + NVCC_WRAPPER="$(dirname ${BASH_SOURCE[0]})/nvcc_wrapper" + if [ "${KOKKOS_COMPILER}" = "${NVCC_WRAPPER}" ]; then + # this should only be valid in the install tree -- it will be set to CMAKE_CXX_COMPILER used using Kokkos installation + if [ -z $(echo "@NVCC_WRAPPER_DEFAULT_COMPILER@" | grep 'NVCC_WRAPPER_DEFAULT_COMPILER') ]; then + : ${NVCC_WRAPPER_DEFAULT_COMPILER:="@NVCC_WRAPPER_DEFAULT_COMPILER@"} + fi - # calling itself will cause an infinitely long build - if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then - echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. Terminating to avoid infinite loop!\n" - exit 1 + # set default nvcc wrapper compiler if not specified + : ${NVCC_WRAPPER_DEFAULT_COMPILER:=${CXX_COMPILER}} + export NVCC_WRAPPER_DEFAULT_COMPILER + + # nvcc_wrapper calling itself will cause an infinitely long build + if [ "${NVCC_WRAPPER}" = "${NVCC_WRAPPER_DEFAULT_COMPILER}" ]; then + echo -e "\nError: NVCC_WRAPPER == NVCC_WRAPPER_DEFAULT_COMPILER. 
Terminating to avoid infinite loop!\n" + exit 1 + fi fi # discard the compiler from the command shift - # execute nvcc_wrapper - ${NVCC_WRAPPER} $@ + debug-message ${KOKKOS_COMPILER} $@ + # execute ${KOKKOS_COMPILER} (again, usually nvcc_wrapper) + ${KOKKOS_COMPILER} $@ fi diff --git a/lib/kokkos/bin/nvcc_wrapper b/lib/kokkos/bin/nvcc_wrapper index 4ecf4c66d5..5556e888e3 100755 --- a/lib/kokkos/bin/nvcc_wrapper +++ b/lib/kokkos/bin/nvcc_wrapper @@ -191,11 +191,11 @@ do shift ;; #Handle known nvcc args - --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--Wext-lambda-captures-this|-Wext-lambda-captures-this) + --dryrun|--verbose|--keep|--keep-dir*|-G|--relocatable-device-code*|-lineinfo|-expt-extended-lambda|-expt-relaxed-constexpr|--resource-usage|-Xptxas*|--fmad*|--use_fast_math|--Wext-lambda-captures-this|-Wext-lambda-captures-this) cuda_args="$cuda_args $1" ;; #Handle more known nvcc args - --expt-extended-lambda|--expt-relaxed-constexpr) + --expt-extended-lambda|--expt-relaxed-constexpr|--Wno-deprecated-gpu-targets|-Wno-deprecated-gpu-targets) cuda_args="$cuda_args $1" ;; #Handle known nvcc args that have an argument diff --git a/lib/kokkos/cmake/CTestConfig.cmake.in b/lib/kokkos/cmake/CTestConfig.cmake.in new file mode 100644 index 0000000000..1f82c0d64d --- /dev/null +++ b/lib/kokkos/cmake/CTestConfig.cmake.in @@ -0,0 +1,91 @@ +#----------------------------------------------------------------------------------------# +# +# CTestConfig.cmake template for Kokkos +# +#----------------------------------------------------------------------------------------# + +# +# dash-board related +# +set(CTEST_PROJECT_NAME "Kokkos") +set(CTEST_NIGHTLY_START_TIME "01:00:00 UTC") +set(CTEST_DROP_METHOD "https") +set(CTEST_DROP_SITE "cdash.nersc.gov") +set(CTEST_DROP_LOCATION "/submit.php?project=${CTEST_PROJECT_NAME}") +set(CTEST_CDASH_VERSION "1.6") 
+set(CTEST_CDASH_QUERY_VERSION TRUE) +set(CTEST_SUBMIT_RETRY_COUNT "1") +set(CTEST_SUBMIT_RETRY_DELAY "30") + +# +# configure/build related +# +set(CTEST_BUILD_NAME "@BUILD_NAME@") +set(CTEST_MODEL "@MODEL@") +set(CTEST_SITE "@SITE@") +set(CTEST_CONFIGURATION_TYPE "@BUILD_TYPE@") +set(CTEST_SOURCE_DIRECTORY "@SOURCE_REALDIR@") +set(CTEST_BINARY_DIRECTORY "@BINARY_REALDIR@") + +# +# update related +# +set(CTEST_UPDATE_TYPE "git") +set(CTEST_UPDATE_VERSION_ONLY ON) +# set(CTEST_GENERATOR "") +# set(CTEST_GENERATOR_PLATFORM "") + +# +# testing related +# +set(CTEST_TIMEOUT "7200") +set(CTEST_TEST_TIMEOUT "7200") +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_ERRORS "100") +set(CTEST_CUSTOM_MAXIMUM_NUMBER_OF_WARNINGS "100") +set(CTEST_CUSTOM_MAXIMUM_PASSED_TEST_OUTPUT_SIZE "1048576") + +# +# coverage related +# +set(CTEST_CUSTOM_COVERAGE_EXCLUDE ".*tpls/.*;/usr/.*;.*unit_test/.*;.*unit_tests/.*;.*perf_test/.*") + +# +# commands +# +if(NOT "@CHECKOUT_COMMAND@" STREQUAL "") + set(CTEST_CHECKOUT_COMMAND "@CHECKOUT_COMMAND@") +endif() +set(CTEST_UPDATE_COMMAND "@GIT_EXECUTABLE@") +set(CTEST_CONFIGURE_COMMAND "@CMAKE_COMMAND@ -DCMAKE_BUILD_TYPE=@BUILD_TYPE@ -DKokkos_ENABLE_TESTS=ON @CONFIG_ARGS@ @SOURCE_REALDIR@") +set(CTEST_BUILD_COMMAND "@CMAKE_COMMAND@ --build @BINARY_REALDIR@ --target @TARGET@") +if(NOT WIN32) + set(CTEST_BUILD_COMMAND "${CTEST_BUILD_COMMAND} -- -j@BUILD_JOBS@") +endif() +set(CTEST_COVERAGE_COMMAND "gcov") +set(CTEST_MEMORYCHECK_COMMAND "valgrind") +set(CTEST_GIT_COMMAND "@GIT_EXECUTABLE@") + +# +# various configs +# +set(APPEND_VALUE @APPEND@) +if(APPEND_VALUE) + set(APPEND_CTEST APPEND) +endif() + +macro(SET_TEST_PROP VAR) # defines VAR_CTEST as "VAR values..." only when at least one value was substituted + if(NOT "${ARGN}" STREQUAL "") # ARGN holds the extra macro arguments; an empty @...@ substitution leaves it empty + set(${VAR}_CTEST ${VAR} ${ARGN}) + endif() +endmacro() + +set_test_prop(START @START@) +set_test_prop(END @END@) +set_test_prop(STRIDE @STRIDE@) +set_test_prop(INCLUDE @INCLUDE@) +set_test_prop(EXCLUDE @EXCLUDE@) +set_test_prop(INCLUDE_LABEL @INCLUDE_LABEL@) +set_test_prop(EXCLUDE_LABEL 
@EXCLUDE_LABEL@) +set_test_prop(PARALLEL_LEVEL @PARALLEL_LEVEL@) +set_test_prop(STOP_TIME @STOP_TIME@) +set_test_prop(COVERAGE_LABELS @LABELS@) diff --git a/lib/kokkos/cmake/KokkosCI.cmake b/lib/kokkos/cmake/KokkosCI.cmake new file mode 100644 index 0000000000..e8c9af37ad --- /dev/null +++ b/lib/kokkos/cmake/KokkosCI.cmake @@ -0,0 +1,350 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +message(STATUS "") + +get_cmake_property(_cached_vars CACHE_VARIABLES) +set(KOKKOS_CMAKE_ARGS) +set(EXCLUDED_VARIABLES "CMAKE_COMMAND" "CMAKE_CPACK_COMMAND" "CMAKE_CTEST_COMMAND" "CMAKE_ROOT" + "CTEST_ARGS" "BUILD_NAME" "CMAKE_CXX_FLAGS" "CMAKE_BUILD_TYPE") +list(SORT _cached_vars) +foreach(_var ${_cached_vars}) + if(NOT "${_var}" IN_LIST EXCLUDED_VARIABLES) + list(APPEND KOKKOS_CMAKE_ARGS ${_var}) + if("${_var}" STREQUAL "CMAKE_BUILD_TYPE") + set(BUILD_TYPE "${CMAKE_BUILD_TYPE}") + endif() + endif() +endforeach() + + +#----------------------------------------------------------------------------------------# +# +# Macros and variables +# +#----------------------------------------------------------------------------------------# + +macro(CHECK_REQUIRED VAR) + if(NOT DEFINED ${VAR}) + message(FATAL_ERROR "Error! 
Variable '${VAR}' must be defined") + endif() +endmacro() + +# require the build name variable +CHECK_REQUIRED(BUILD_NAME) + +# uses all args +macro(SET_DEFAULT VAR) + if(NOT DEFINED ${VAR}) + set(${VAR} ${ARGN}) + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# uses first arg -- useful for selecting via priority from multiple +# potentially defined variables, e.g.: +# +# set_default_arg1(BUILD_NAME ${TRAVIS_BUILD_NAME} ${BUILD_NAME}) +# +macro(SET_DEFAULT_ARG1 VAR) + if(NOT DEFINED ${VAR}) + foreach(_ARG ${ARGN}) + if(NOT "${_ARG}" STREQUAL "") + set(${VAR} ${_ARG}) + break() + endif() + endforeach() + endif() + # remove these ctest configuration variables from the defines + # passed to the Kokkos configuration + if("${VAR}" IN_LIST KOKKOS_CMAKE_ARGS) + list(REMOVE_ITEM KOKKOS_CMAKE_ARGS "${VAR}") + endif() +endmacro() + +# determine the default working directory +if(NOT "$ENV{WORKSPACE}" STREQUAL "") + set(WORKING_DIR "$ENV{WORKSPACE}") +else() + get_filename_component(WORKING_DIR ${CMAKE_CURRENT_LIST_DIR} DIRECTORY) +endif() + +# determine the hostname +execute_process(COMMAND hostname + OUTPUT_VARIABLE HOSTNAME + OUTPUT_STRIP_TRAILING_WHITESPACE) + +SET_DEFAULT(HOSTNAME "$ENV{HOSTNAME}") + +# get the number of processors +include(ProcessorCount) +ProcessorCount(NUM_PROCESSORS) + +# find git +find_package(Git QUIET) +if(NOT GIT_EXECUTABLE) + unset(GIT_EXECUTABLE CACHE) + unset(GIT_EXECUTABLE) +endif() + +function(EXECUTE_GIT_COMMAND VAR) + set(${VAR} "" PARENT_SCOPE) + execute_process(COMMAND ${GIT_EXECUTABLE} ${ARGN} + OUTPUT_VARIABLE VAL + RESULT_VARIABLE RET + OUTPUT_STRIP_TRAILING_WHITESPACE + WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} + ERROR_QUIET) + string(REPLACE ";" " " _CMD "${GIT_EXECUTABLE} ${ARGN}") + set(LAST_GIT_COMMAND "${_CMD}" PARENT_SCOPE) + if(RET EQUAL 0) + 
set(${VAR} "${VAL}" PARENT_SCOPE) + endif() +endfunction() + +# sets VAR in the caller's scope to the git branch name if one can be determined +function(GET_GIT_BRANCH_NAME VAR) + execute_git_command(GIT_BRANCH branch --show-current) + set(_INVALID "%D" "HEAD") + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH show -s --format=%D) + if(NOT GIT_BRANCH OR "${GIT_BRANCH}" IN_LIST _INVALID) + execute_git_command(GIT_BRANCH describe --all) + endif() + endif() + # + if(GIT_BRANCH) + string(REPLACE " " ";" _DESC "${GIT_BRANCH}") + # just set it to last one via loop instead of wonky cmake index manip + foreach(_ITR ${_DESC}) + set(GIT_BRANCH "${_ITR}") + endforeach() + set(${VAR} "${GIT_BRANCH}" PARENT_SCOPE) + message(STATUS "GIT BRANCH via '${LAST_GIT_COMMAND}': ${GIT_BRANCH}") + endif() +endfunction() + +# sets VAR in the caller's scope to the git author name of the current commit (abbreviated when long) if available +function(GET_GIT_AUTHOR_NAME VAR) + execute_git_command(GIT_AUTHOR show -s --format=%an) + if(GIT_AUTHOR) + string(LENGTH "${GIT_AUTHOR}" STRLEN) + # if the build name gets too long, this can cause submission errors + if(STRLEN GREATER 24) + # remove middle initial (double backslash so the regex sees a literal period) + string(REGEX REPLACE " [A-Z]\\. " " " GIT_AUTHOR "${GIT_AUTHOR}") + # get first and sur name + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\1" F_NAME "${GIT_AUTHOR}") + string(REGEX REPLACE "([A-Za-z]+) ([A-Za-z]+)" "\\2" S_NAME "${GIT_AUTHOR}") + if(S_NAME) + set(GIT_AUTHOR "${S_NAME}") + elseif(F_NAME) + set(GIT_AUTHOR "${F_NAME}") + endif() + endif() + # remove any spaces, quotes, periods, etc. 
+ string(REGEX REPLACE "[ ',;_\.\"]+" "" GIT_AUTHOR "${GIT_AUTHOR}") + set(${VAR} "${GIT_AUTHOR}" PARENT_SCOPE) + message(STATUS "GIT AUTHOR via '${LAST_GIT_COMMAND}': ${GIT_AUTHOR}") + endif() +endfunction() + +# get the name of the branch +GET_GIT_BRANCH_NAME(GIT_BRANCH) +# get the name of the author +GET_GIT_AUTHOR_NAME(GIT_AUTHOR) +# author, prefer git method for consistency +SET_DEFAULT_ARG1(AUTHOR ${GIT_AUTHOR} $ENV{GIT_AUTHOR} $ENV{AUTHOR}) +# SLUG == owner_name/repo_name +SET_DEFAULT_ARG1(SLUG $ENV{TRAVIS_PULL_REQUEST_SLUG} $ENV{TRAVIS_REPO_SLUG} $ENV{APPVEYOR_REPO_NAME} $ENV{PULL_REQUEST_SLUG} $ENV{REPO_SLUG}) +# branch name +SET_DEFAULT_ARG1(BRANCH $ENV{TRAVIS_PULL_REQUEST_BRANCH} $ENV{TRAVIS_BRANCH} $ENV{APPVEYOR_PULL_REQUEST_HEAD_REPO_BRANCH} $ENV{APPVEYOR_REPO_BRANCH} $ENV{GIT_BRANCH} $ENV{BRANCH_NAME} $ENV{BRANCH} ${GIT_BRANCH}) +# pull request number +SET_DEFAULT_ARG1(PULL_REQUEST_NUM $ENV{TRAVIS_PULL_REQUEST} $ENV{CHANGE_ID} $ENV{APPVEYOR_PULL_REQUEST_NUMBER} $ENV{PULL_REQUEST_NUM}) +# get the event type, e.g. push, pull_request, api, cron, etc. +SET_DEFAULT_ARG1(EVENT_TYPE $ENV{TRAVIS_EVENT_TYPE} ${EVENT_TYPE}) + +if("${BRANCH}" STREQUAL "") + message(STATUS "Checked: environment variables for Travis, Appveyor, Jenkins (git plugin), BRANCH_NAME, BRANCH and 'git branch --show-current'") + message(FATAL_ERROR "Error! Git branch could not be determined. Please provide -DBRANCH=") +endif() + +#----------------------------------------------------------------------------------------# +# +# Set default values if not provided on command-line +# +#----------------------------------------------------------------------------------------# + +SET_DEFAULT(SOURCE_DIR "${WORKING_DIR}") # source directory +SET_DEFAULT(BINARY_DIR "${WORKING_DIR}/build") # build directory +SET_DEFAULT(BUILD_TYPE "${CMAKE_BUILD_TYPE}") # Release, Debug, etc. 
+SET_DEFAULT(MODEL "Continuous") # Continuous, Nightly, or Experimental +SET_DEFAULT(JOBS 1) # number of parallel ctests +SET_DEFAULT(CTEST_COMMAND "${CMAKE_CTEST_COMMAND}") # just in case +SET_DEFAULT(CTEST_ARGS "-V --output-on-failure") # extra arguments when ctest is called +SET_DEFAULT(GIT_EXECUTABLE "git") # ctest_update +SET_DEFAULT(TARGET "all") # build target +SET_DEFAULT_ARG1(SITE "$ENV{SITE}" + "${HOSTNAME}") # update site +SET_DEFAULT_ARG1(BUILD_JOBS "$ENV{BUILD_JOBS}" + "${NUM_PROCESSORS}") # number of parallel compile jobs +# +# The variable below correspond to ctest arguments, i.e. START,END,STRIDE are +# '-I START,END,STRIDE' +# +SET_DEFAULT(START "") +SET_DEFAULT(END "") +SET_DEFAULT(STRIDE "") +SET_DEFAULT(INCLUDE "") +SET_DEFAULT(EXCLUDE "") +SET_DEFAULT(INCLUDE_LABEL "") +SET_DEFAULT(EXCLUDE_LABEL "") +SET_DEFAULT(PARALLEL_LEVEL "") +SET_DEFAULT(STOP_TIME "") +SET_DEFAULT(LABELS "") +SET_DEFAULT(NOTES "") + +# default static build tag for Nightly +set(BUILD_TAG "${BRANCH}") + +if(NOT BUILD_TYPE) + # default for kokkos if not specified + set(BUILD_TYPE "RelWithDebInfo") +endif() + +# generate dynamic name if continuous or experimental model +if(NOT "${MODEL}" STREQUAL "Nightly") + if(EVENT_TYPE AND PULL_REQUEST_NUM) + # e.g. pull_request/123 + if(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${EVENT_TYPE}/${PULL_REQUEST_NUM}") + else() + set(BUILD_TAG "${EVENT_TYPE}/${PULL_REQUEST_NUM}") + endif() + elseif(SLUG) + # e.g. 
owner_name/repo_name + set(BUILD_TAG "${SLUG}") + elseif(AUTHOR) + set(BUILD_TAG "${AUTHOR}/${BRANCH}") + endif() + if(EVENT_TYPE AND NOT PULL_REQUEST_NUM) + set(BUILD_TAG "${BUILD_TAG}-${EVENT_TYPE}") + endif() +endif() + +# unnecessary +string(REPLACE "/remotes/" "/" BUILD_TAG "${BUILD_TAG}") +string(REPLACE "/origin/" "/" BUILD_TAG "${BUILD_TAG}") + +message(STATUS "BUILD_TAG: ${BUILD_TAG}") + +set(BUILD_NAME "[${BUILD_TAG}] [${BUILD_NAME}-${BUILD_TYPE}]") + +# colons in build name create extra (empty) entries in CDash +string(REPLACE ":" "-" BUILD_NAME "${BUILD_NAME}") +# unnecessary info +string(REPLACE "/merge]" "]" BUILD_NAME "${BUILD_NAME}") +# consistency +string(REPLACE "/pr/" "/pull/" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "pull_request/" "pull/" BUILD_NAME "${BUILD_NAME}") +# miscellaneous from missing fields +string(REPLACE "--" "-" BUILD_NAME "${BUILD_NAME}") +string(REPLACE "-]" "]" BUILD_NAME "${BUILD_NAME}") + +# check binary directory +if(EXISTS ${BINARY_DIR}) + if(NOT IS_DIRECTORY "${BINARY_DIR}") + message(FATAL_ERROR "Error! '${BINARY_DIR}' already exists and is not a directory!") + endif() + file(GLOB BINARY_DIR_FILES "${BINARY_DIR}/*") + if(NOT "${BINARY_DIR_FILES}" STREQUAL "") + message(FATAL_ERROR "Error! 
'${BINARY_DIR}' already exists and is not empty!") + endif() +endif() + +get_filename_component(SOURCE_REALDIR ${SOURCE_DIR} REALPATH) +get_filename_component(BINARY_REALDIR ${BINARY_DIR} REALPATH) + +#----------------------------------------------------------------------------------------# +# +# Generate the CTestConfig.cmake +# +#----------------------------------------------------------------------------------------# + +set(CONFIG_ARGS) +foreach(_ARG ${KOKKOS_CMAKE_ARGS}) + if(NOT "${${_ARG}}" STREQUAL "") + get_property(_ARG_TYPE CACHE ${_ARG} PROPERTY TYPE) + if("${_ARG_TYPE}" STREQUAL "UNINITIALIZED") + if("${${_ARG}}" STREQUAL "ON" OR "${${_ARG}}" STREQUAL "OFF") + set(_ARG_TYPE "BOOL") + elseif(EXISTS "${${_ARG}}" AND NOT IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "FILEPATH") + elseif(EXISTS "${${_ARG}}" AND IS_DIRECTORY "${${_ARG}}") + set(_ARG_TYPE "PATH") + elseif(NOT "${${_ARG}}" STREQUAL "") + set(_ARG_TYPE "STRING") + endif() + endif() + set(CONFIG_ARGS "${CONFIG_ARGS}set(${_ARG} \"${${_ARG}}\" CACHE ${_ARG_TYPE} \"\")\n") + endif() +endforeach() + +file(WRITE ${BINARY_REALDIR}/initial-cache.cmake +" +set(CMAKE_CXX_FLAGS \"${CMAKE_CXX_FLAGS}\" CACHE STRING \"\") +${CONFIG_ARGS} +") + +file(READ ${BINARY_REALDIR}/initial-cache.cmake _CACHE_INFO) +message(STATUS "Initial cache:\n${_CACHE_INFO}") + +# initialize the cache +set(CONFIG_ARGS "-C ${BINARY_REALDIR}/initial-cache.cmake") + + +# generate the CTestConfig.cmake +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake.in + ${BINARY_REALDIR}/CTestConfig.cmake + @ONLY) + +# copy/generate the dashboard script +configure_file( + ${CMAKE_CURRENT_LIST_DIR}/KokkosCTest.cmake.in + ${BINARY_REALDIR}/KokkosCTest.cmake + @ONLY) + +# custom CTest settings go in ${BINARY_DIR}/CTestCustom.cmake +execute_process( + COMMAND ${CMAKE_COMMAND} -E touch CTestCustom.cmake + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +#----------------------------------------------------------------------------------------# +# +# 
Execute CTest +# +#----------------------------------------------------------------------------------------# + +message(STATUS "") +message(STATUS "BUILD_NAME: ${BUILD_NAME}") +message(STATUS "Executing '${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS}'...") +message(STATUS "") + +# e.g. -DCTEST_ARGS="--output-on-failure -VV" should really be -DCTEST_ARGS="--output-on-failure;-VV" +string(REPLACE " " ";" CTEST_ARGS "${CTEST_ARGS}") + +execute_process( + COMMAND ${CTEST_COMMAND} -S KokkosCTest.cmake ${CTEST_ARGS} + RESULT_VARIABLE RET + WORKING_DIRECTORY ${BINARY_REALDIR} + ) + +# ensure that any non-zero result variable gets propagated +if(NOT RET EQUAL 0) + message(FATAL_ERROR "CTest return non-zero exit code: ${RET}") +endif() diff --git a/lib/kokkos/cmake/KokkosCTest.cmake.in b/lib/kokkos/cmake/KokkosCTest.cmake.in new file mode 100644 index 0000000000..b6917f3cc1 --- /dev/null +++ b/lib/kokkos/cmake/KokkosCTest.cmake.in @@ -0,0 +1,261 @@ +cmake_minimum_required(VERSION 3.16 FATAL_ERROR) + +if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") + include("${CMAKE_CURRENT_LIST_DIR}/CTestConfig.cmake") +endif() + +include(ProcessorCount) +ProcessorCount(CTEST_PROCESSOR_COUNT) + +cmake_policy(SET CMP0009 NEW) +cmake_policy(SET CMP0011 NEW) + +# ---------------------------------------------------------------------------- # +# -- Commands +# ---------------------------------------------------------------------------- # +find_program(CTEST_CMAKE_COMMAND NAMES cmake) +find_program(CTEST_UNAME_COMMAND NAMES uname) + +find_program(CTEST_BZR_COMMAND NAMES bzr) +find_program(CTEST_CVS_COMMAND NAMES cvs) +find_program(CTEST_GIT_COMMAND NAMES git) +find_program(CTEST_HG_COMMAND NAMES hg) +find_program(CTEST_P4_COMMAND NAMES p4) +find_program(CTEST_SVN_COMMAND NAMES svn) + +find_program(VALGRIND_COMMAND NAMES valgrind) +find_program(GCOV_COMMAND NAMES gcov) +find_program(LCOV_COMMAND NAMES llvm-cov) +find_program(MEMORYCHECK_COMMAND NAMES valgrind ) + 
+set(MEMORYCHECK_TYPE Valgrind) +# set(MEMORYCHECK_TYPE Purify) +# set(MEMORYCHECK_TYPE BoundsChecker) +# set(MEMORYCHECK_TYPE ThreadSanitizer) +# set(MEMORYCHECK_TYPE AddressSanitizer) +# set(MEMORYCHECK_TYPE LeakSanitizer) +# set(MEMORYCHECK_TYPE MemorySanitizer) +# set(MEMORYCHECK_TYPE UndefinedBehaviorSanitizer) +set(MEMORYCHECK_COMMAND_OPTIONS "--trace-children=yes --leak-check=full") + +# ---------------------------------------------------------------------------- # +# -- Settings +# ---------------------------------------------------------------------------- # +## -- Process timeout in seconds +set(CTEST_TIMEOUT "7200") +## -- Set output to English +set(ENV{LC_MESSAGES} "en_EN" ) + + +# ---------------------------------------------------------------------------- # +# -- Copy ctest configuration file +# ---------------------------------------------------------------------------- # +macro(COPY_CTEST_CONFIG_FILES) + + foreach(_FILE CTestConfig.cmake CTestCustom.cmake) + + # if current directory is not binary or source directory + if(NOT "${CMAKE_CURRENT_LIST_DIR}" STREQUAL "${CTEST_BINARY_DIRECTORY}" AND + NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in current directory + if(EXISTS ${CMAKE_CURRENT_LIST_DIR}/${_FILE}) + configure_file(${CMAKE_CURRENT_LIST_DIR}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + # if source and binary differ + elseif(NOT "${CTEST_SOURCE_DIRECTORY}" STREQUAL "${CTEST_BINARY_DIRECTORY}") + + # if file exists in source directory but not in binary directory + if(EXISTS ${CTEST_SOURCE_DIRECTORY}/${_FILE} AND + NOT EXISTS ${CTEST_BINARY_DIRECTORY}/${_FILE}) + configure_file(${CTEST_SOURCE_DIRECTORY}/${_FILE} + ${CTEST_BINARY_DIRECTORY}/${_FILE} COPYONLY) + endif() + + endif() + endforeach() + +endmacro() + +ctest_read_custom_files("${CMAKE_CURRENT_LIST_DIR}") + +message(STATUS "CTEST_MODEL: ${CTEST_MODEL}") + 
+#-------------------------------------------------------------------------# +# Start +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running START_CTEST stage...") +message(STATUS "") + +ctest_start(${CTEST_MODEL} TRACK ${CTEST_MODEL} ${APPEND_CTEST} + ${CTEST_SOURCE_DIRECTORY} ${CTEST_BINARY_DIRECTORY}) + + +#-------------------------------------------------------------------------# +# Config +# +copy_ctest_config_files() +ctest_read_custom_files("${CTEST_BINARY_DIRECTORY}") + + +#-------------------------------------------------------------------------# +# Update +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_UPDATE stage...") +message(STATUS "") + +ctest_update(SOURCE "${CTEST_SOURCE_DIRECTORY}" + RETURN_VALUE up_ret) + + +#-------------------------------------------------------------------------# +# Configure +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_CONFIGURE stage...") +message(STATUS "") + +ctest_configure(BUILD "${CTEST_BINARY_DIRECTORY}" + SOURCE ${CTEST_SOURCE_DIRECTORY} + ${APPEND_CTEST} + OPTIONS "${CTEST_CONFIGURE_OPTIONS}" + RETURN_VALUE config_ret) + + +#-------------------------------------------------------------------------# +# Echo configure log bc Damien wants to delay merging this PR for eternity +# +file(GLOB _configure_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastConfigure*.log") +# should only have one but loop just for safety +foreach(_LOG ${_configure_log}) + file(READ ${_LOG} _LOG_MESSAGE) + message(STATUS "Configure Log: ${_LOG}") + message(STATUS "\n${_LOG_MESSAGE}\n") +endforeach() + + +#-------------------------------------------------------------------------# +# Build +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_BUILD stage...") +message(STATUS "") + +ctest_build(BUILD "${CTEST_BINARY_DIRECTORY}" + ${APPEND_CTEST} + RETURN_VALUE build_ret) + + +#-------------------------------------------------------------------------# 
+# Echo build log bc Damien wants to delay merging this PR for eternity +# +file(GLOB _build_log "${CTEST_BINARY_DIRECTORY}/Testing/Temporary/LastBuild*.log") +# should only have one but loop just for safety +foreach(_LOG ${_build_log}) + file(READ ${_LOG} _LOG_MESSAGE) + message(STATUS "Build Log: ${_LOG}") + message(STATUS "\n${_LOG_MESSAGE}\n") +endforeach() + + +#-------------------------------------------------------------------------# +# Test +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_TEST stage...") +message(STATUS "") + +ctest_test(RETURN_VALUE test_ret + ${APPEND_CTEST} + ${START_CTEST} + ${END_CTEST} + ${STRIDE_CTEST} + ${INCLUDE_CTEST} + ${EXCLUDE_CTEST} + ${INCLUDE_LABEL_CTEST} + ${EXCLUDE_LABEL_CTEST} + ${PARALLEL_LEVEL_CTEST} + ${STOP_TIME_CTEST} + SCHEDULE_RANDOM OFF) + + +#-------------------------------------------------------------------------# +# Coverage +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_COVERAGE stage...") +message(STATUS "") + +execute_process(COMMAND ${CTEST_COVERAGE_COMMAND} ${CTEST_COVERAGE_EXTRA_FLAGS} + WORKING_DIRECTORY ${CTEST_BINARY_DIRECTORY} + ERROR_QUIET) + +ctest_coverage(${APPEND_CTEST} + ${CTEST_COVERAGE_LABELS} + RETURN_VALUE cov_ret) + + +#-------------------------------------------------------------------------# +# MemCheck +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_MEMCHECK stage...") +message(STATUS "") + +ctest_memcheck(RETURN_VALUE mem_ret + ${APPEND_CTEST} + ${START_CTEST} + ${END_CTEST} + ${STRIDE_CTEST} + ${INCLUDE_CTEST} + ${EXCLUDE_CTEST} + ${INCLUDE_LABEL_CTEST} + ${EXCLUDE_LABEL_CTEST} + ${PARALLEL_LEVEL_CTEST}) + + +#-------------------------------------------------------------------------# +# Submit +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Running CTEST_SUBMIT stage...") +message(STATUS "") + +file(GLOB_RECURSE NOTE_FILES "${CTEST_BINARY_DIRECTORY}/*CTestNotes.cmake") +foreach(_FILE 
${NOTE_FILES}) + message(STATUS "Including CTest notes files: \"${_FILE}\"...") + include("${_FILE}") +endforeach() + +# capture submit error so it doesn't fail because of a submission error +ctest_submit(RETURN_VALUE submit_ret + RETRY_COUNT 2 + RETRY_DELAY 10 + CAPTURE_CMAKE_ERROR submit_err) + +#-------------------------------------------------------------------------# +# Submit +# +message(STATUS "") +message(STATUS "[${CTEST_BUILD_NAME}] Finished ${CTEST_MODEL} Stages (${STAGES})") +message(STATUS "") + + +#-------------------------------------------------------------------------# +# Non-zero exit codes for important errors +# +if(NOT config_ret EQUAL 0) + message(FATAL_ERROR "Error during configuration! Exit code: ${config_ret}") +endif() + +if(NOT build_ret EQUAL 0) + message(FATAL_ERROR "Error during build! Exit code: ${build_ret}") +endif() + +if(NOT test_ret EQUAL 0) + message(FATAL_ERROR "Error during testing! Exit code: ${test_ret}") +endif() diff --git a/lib/kokkos/cmake/KokkosConfig.cmake.in b/lib/kokkos/cmake/KokkosConfig.cmake.in index 9fbd22ee5c..44a8fcd9c3 100644 --- a/lib/kokkos/cmake/KokkosConfig.cmake.in +++ b/lib/kokkos/cmake/KokkosConfig.cmake.in @@ -19,17 +19,44 @@ INCLUDE("${Kokkos_CMAKE_DIR}/KokkosTargets.cmake") INCLUDE("${Kokkos_CMAKE_DIR}/KokkosConfigCommon.cmake") UNSET(Kokkos_CMAKE_DIR) -# if CUDA was enabled and separable compilation was specified, e.g. 
-# find_package(Kokkos COMPONENTS separable_compilation) -# then we set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK -IF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) +# check for conflicts +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS AND + "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + MESSAGE(STATUS "'launch_compiler' implies global redirection of targets depending on Kokkos to appropriate compiler.") + MESSAGE(STATUS "'separable_compilation' implies explicitly defining where redirection occurs via 'kokkos_compilation(PROJECT|TARGET|SOURCE|DIRECTORY ...)'") + MESSAGE(FATAL_ERROR "Conflicting COMPONENTS: 'launch_compiler' and 'separable_compilation'") +ENDIF() + +IF("launch_compiler" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if find_package(Kokkos COMPONENTS launch_compiler) then rely on the + # RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK to always redirect to the + # appropriate compiler for Kokkos + # + + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") + kokkos_compilation( + GLOBAL + CHECK_CUDA_COMPILES) + +ELSEIF(@Kokkos_ENABLE_CUDA@ AND NOT "separable_compilation" IN_LIST Kokkos_FIND_COMPONENTS) + # + # if CUDA was enabled, separable compilation was not specified, and current compiler + # cannot compile CUDA, then set the RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK globally and + # kokkos_launch_compiler will re-direct to the compiler used to compile CUDA code during installation. 
+ # kokkos_launch_compiler will re-direct if ${CMAKE_CXX_COMPILER} and -DKOKKOS_DEPENDENCE is present, + # otherwise, the original command will be executed + # + # run test to see if CMAKE_CXX_COMPILER=nvcc_wrapper kokkos_compiler_is_nvcc(IS_NVCC ${CMAKE_CXX_COMPILER}) - # if not nvcc_wrapper, use RULE_LAUNCH_COMPILE and RULE_LAUNCH_LINK - IF(NOT IS_NVCC AND NOT CMAKE_CXX_COMPILER_ID STREQUAL Clang AND - (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) - MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to nvcc_wrapper") + + # if not nvcc_wrapper and Kokkos_LAUNCH_COMPILER was not set to OFF + IF(NOT IS_NVCC AND (NOT DEFINED Kokkos_LAUNCH_COMPILER OR Kokkos_LAUNCH_COMPILER)) + MESSAGE(STATUS "kokkos_launch_compiler is enabled globally. C++ compiler commands with -DKOKKOS_DEPENDENCE will be redirected to the appropriate compiler for Kokkos") kokkos_compilation(GLOBAL) ENDIF() - UNSET(IS_NVCC) # be mindful of the environment, pollution is bad + + # be mindful of the environment, pollution is bad + UNSET(IS_NVCC) ENDIF() diff --git a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in index 42c755c215..ab93e65afe 100644 --- a/lib/kokkos/cmake/KokkosConfigCommon.cmake.in +++ b/lib/kokkos/cmake/KokkosConfigCommon.cmake.in @@ -3,6 +3,7 @@ SET(Kokkos_OPTIONS @KOKKOS_ENABLED_OPTIONS@) SET(Kokkos_TPLS @KOKKOS_ENABLED_TPLS@) SET(Kokkos_ARCH @KOKKOS_ENABLED_ARCH_LIST@) SET(Kokkos_CXX_COMPILER "@CMAKE_CXX_COMPILER@") +SET(Kokkos_CXX_COMPILER_ID "@KOKKOS_CXX_COMPILER_ID@") # These are needed by KokkosKernels FOREACH(DEV ${Kokkos_DEVICES}) @@ -13,13 +14,13 @@ IF(NOT Kokkos_FIND_QUIETLY) MESSAGE(STATUS "Enabled Kokkos devices: ${Kokkos_DEVICES}") ENDIF() -IF (Kokkos_ENABLE_CUDA AND ${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.14.0") - #If we are building CUDA, we have tricked CMake because we declare a CXX project - #If the default C++ standard for a 
given compiler matches the requested - #standard, then CMake just omits the -std flag in later versions of CMake - #This breaks CUDA compilation (CUDA compiler can have a different default - #-std then the underlying host compiler by itself). Setting this variable - #forces CMake to always add the -std flag even if it thinks it doesn't need it +IF (Kokkos_ENABLE_CUDA) + # If we are building CUDA, we have tricked CMake because we declare a CXX project + # If the default C++ standard for a given compiler matches the requested + # standard, then CMake just omits the -std flag in later versions of CMake + # This breaks CUDA compilation (CUDA compiler can have a different default + # -std then the underlying host compiler by itself). Setting this variable + # forces CMake to always add the -std flag even if it thinks it doesn't need it SET(CMAKE_CXX_STANDARD_DEFAULT 98 CACHE INTERNAL "" FORCE) ENDIF() @@ -90,52 +91,6 @@ function(kokkos_check) endif() endfunction() -# this function is provided to easily select which files use nvcc_wrapper: -# -# GLOBAL --> all files -# TARGET --> all files in a target -# SOURCE --> specific source files -# DIRECTORY --> all files in directory -# PROJECT --> all files/targets in a project/subproject -# -FUNCTION(kokkos_compilation) - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - - # search relative first and then absolute - SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") - - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${_HINTS} - PATHS ${_HINTS} - PATH_SUFFIXES bin) - - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() - - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) - IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() - # A test to check whether a downstream project set the C++ compiler to NVCC or not # this is called only when Kokkos was installed with Kokkos_ENABLE_CUDA=ON FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) @@ -159,3 +114,161 @@ FUNCTION(kokkos_compiler_is_nvcc VAR COMPILER) ENDIF() ENDFUNCTION() +# this function checks whether the current CXX compiler supports building CUDA +FUNCTION(kokkos_cxx_compiler_cuda_test _VAR _COMPILER) + + FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu +" +#include +#include + +__global__ +void kernel(int sz, double* data) +{ + int _beg = blockIdx.x * blockDim.x + threadIdx.x; + for(int i = _beg; i < sz; ++i) + data[i] += static_cast(i); +} + +int main() +{ + double* data = NULL; + int blocks = 64; + int grids = 64; + int ret = cudaMalloc(&data, blocks * grids * sizeof(double)); + if(ret != cudaSuccess) + return EXIT_FAILURE; + kernel<<>>(blocks * grids, data); + cudaDeviceSynchronize(); + return 
EXIT_SUCCESS; +} +") + + # save the command for debugging + SET(_COMMANDS "${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + + # use execute_process instead of try compile because we want to set custom compiler + EXECUTE_PROCESS(COMMAND ${_COMPILER} ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + + IF(NOT _RET EQUAL 0) + # save the command for debugging + SET(_COMMANDS "${_COMMAND}\n${_COMPILER} --cuda-gpu-arch=sm_35 ${ARGN} -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu") + # try the compile test again with clang arguments + EXECUTE_PROCESS(COMMAND ${_COMPILER} --cuda-gpu-arch=sm_35 -c ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cu + RESULT_VARIABLE _RET + WORKING_DIRECTORY ${PROJECT_BINARY_DIR}/compile_tests + TIMEOUT 15 + OUTPUT_QUIET + ERROR_QUIET) + ENDIF() + + SET(${_VAR}_COMMANDS "${_COMMANDS}" PARENT_SCOPE) + SET(${_VAR} ${_RET} PARENT_SCOPE) +ENDFUNCTION() + +# this function is provided to easily select which files use the same compiler as Kokkos +# when it was installed (or nvcc_wrapper): +# +# GLOBAL --> all files +# TARGET --> all files in a target +# SOURCE --> specific source files +# DIRECTORY --> all files in directory +# PROJECT --> all files/targets in a project/subproject +# +# Use the COMPILER argument to specify a compiler, if needed. 
By default, it will +# set the values to ${Kokkos_CXX_COMPILER} unless Kokkos_ENABLE_CUDA=ON and +# Kokkos_CXX_COMPILER_ID is NVIDIA, then it will set it to nvcc_wrapper +# +# Use CHECK_CUDA_COMPILES to run a check when CUDA is enabled +# +FUNCTION(kokkos_compilation) + CMAKE_PARSE_ARGUMENTS(COMP + "GLOBAL;PROJECT;CHECK_CUDA_COMPILES" + "COMPILER" + "DIRECTORY;TARGET;SOURCE;COMMAND_PREFIX" + ${ARGN}) + + # if built w/o CUDA support, we want to basically make this a no-op + SET(_Kokkos_ENABLE_CUDA @Kokkos_ENABLE_CUDA@) + + # search relative first and then absolute + SET(_HINTS "${CMAKE_CURRENT_LIST_DIR}/../.." "@CMAKE_INSTALL_PREFIX@") + + # find kokkos_launch_compiler + FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") + ENDIF() + + # if COMPILER was not specified, assume Kokkos_CXX_COMPILER + IF(NOT COMP_COMPILER) + SET(COMP_COMPILER ${Kokkos_CXX_COMPILER}) + IF(_Kokkos_ENABLE_CUDA AND Kokkos_CXX_COMPILER_ID STREQUAL NVIDIA) + # find nvcc_wrapper + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${_HINTS} + PATHS ${_HINTS} + PATH_SUFFIXES bin) + # fatal if we can't nvcc_wrapper + IF(NOT Kokkos_NVCC_WRAPPER) + MESSAGE(FATAL_ERROR "Kokkos could not find nvcc_wrapper. Please set '-DKokkos_NVCC_WRAPPER=/path/to/nvcc_wrapper'") + ENDIF() + SET(COMP_COMPILER ${Kokkos_NVCC_WRAPPER}) + ENDIF() + ENDIF() + + # check that the original compiler still exists! + IF(NOT EXISTS ${COMP_COMPILER}) + MESSAGE(FATAL_ERROR "Kokkos could not find original compiler: '${COMP_COMPILER}'") + ENDIF() + + # try to ensure that compiling cuda code works! 
+ IF(_Kokkos_ENABLE_CUDA AND COMP_CHECK_CUDA_COMPILES) + + # this may fail if kokkos_compiler launcher was used during install + kokkos_cxx_compiler_cuda_test(_COMPILES_CUDA + ${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}) + + # if above failed, throw an error + IF(NOT _COMPILES_CUDA) + MESSAGE(FATAL_ERROR "kokkos_cxx_compiler_cuda_test failed! Test commands:\n${_COMPILES_CUDA_COMMANDS}") + ENDIF() + ENDIF() + + IF(COMP_COMMAND_PREFIX) + SET(_PREFIX "${COMP_COMMAND_PREFIX}") + STRING(REPLACE ";" " " _PREFIX "${COMP_COMMAND_PREFIX}") + SET(Kokkos_COMPILER_LAUNCHER "${_PREFIX} ${Kokkos_COMPILE_LAUNCHER}") + ENDIF() + + IF(COMP_GLOBAL) + # if global, don't bother setting others + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + ELSE() + FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) 
+ IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + UNSET(COMP_${_TYPE}) + ENDIF() + # set the properties if defined + IF(COMP_${_TYPE}) + # MESSAGE(STATUS "Using ${COMP_COMPILER} :: ${_TYPE} :: ${COMP_${_TYPE}}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${COMP_COMPILER} ${CMAKE_CXX_COMPILER}") + ENDIF() + ENDFOREACH() + ENDIF() +ENDFUNCTION() diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 0259fe69d5..fbfae3711e 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -78,6 +78,7 @@ #cmakedefine KOKKOS_ARCH_POWER7 #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 +#cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_KEPLER #cmakedefine KOKKOS_ARCH_KEPLER30 #cmakedefine KOKKOS_ARCH_KEPLER32 @@ -95,5 +96,8 @@ #cmakedefine KOKKOS_ARCH_VOLTA72 #cmakedefine KOKKOS_ARCH_TURING75 #cmakedefine KOKKOS_ARCH_AMPERE80 +#cmakedefine KOKKOS_ARCH_AMPERE86 #cmakedefine KOKKOS_ARCH_AMD_ZEN #cmakedefine KOKKOS_ARCH_AMD_ZEN2 + +#cmakedefine KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF diff --git a/lib/kokkos/cmake/Modules/CudaToolkit.cmake b/lib/kokkos/cmake/Modules/CudaToolkit.cmake index d620a71d36..eda5541f7c 100644 --- a/lib/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/lib/kokkos/cmake/Modules/CudaToolkit.cmake @@ -481,76 +481,6 @@ if(CMAKE_CUDA_COMPILER_LOADED AND NOT CUDAToolkit_BIN_DIR AND CMAKE_CUDA_COMPILE unset(cuda_dir) endif() -IF(CMAKE_VERSION VERSION_LESS "3.12.0") - function(import_target_link_libraries target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(LIBS ${target} INTERFACE_LINK_LIBRARIES) - if (LIBS) - list(APPEND LIBS ${HACK_UNPARSED_ARGUMENTS}) - 
else() - set(LIBS ${HACK_UNPARSED_ARGUMENTS}) - endif() - set_target_properties(${target} PROPERTIES - INTERFACE_LINK_LIBRARIES "${LIBS}") - endfunction() -ELSE() - function(import_target_link_libraries) - target_link_libraries(${ARGN}) - endfunction() -ENDIF() - -IF(CMAKE_VERSION VERSION_LESS "3.13.0") - function(import_target_link_directories target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(LINK_LIBS ${target} INTERFACE_LINK_LIBRARIES) - if (LINK_LIBS) #could be not-found - set(LINK_LIBS_LIST ${LINK_LIBS}) - endif() - foreach(LIB ${HACK_UNPARSED_ARGUMENTS}) - list(APPEND LINK_LIBS_LIST -L${LIB}) - endforeach() - set_target_properties(${target} PROPERTIES - INTERFACE_LINK_LIBRARIES "${LINK_LIBS_LIST}") - endfunction() -ELSE() - function(import_target_link_directories) - target_link_directories(${ARGN}) - endfunction() -ENDIF() - -IF(CMAKE_VERSION VERSION_LESS "3.12.0") - function(import_target_include_directories target) - cmake_parse_arguments(HACK - "SYSTEM;INTERFACE;PUBLIC" - "" - "" - ${ARGN} - ) - get_target_property(INLUDE_DIRS ${target} INTERFACE_INCLUDE_DIRECTORIES) - if (INCLUDE_DIRS) - list(APPEND INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS}) - else() - set(INCLUDE_DIRS ${HACK_UNPARSED_ARGUMENTS}) - endif() - set_target_properties(${target} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${INCLUDE_DIRS}") - endfunction() -ELSE() - function(import_target_include_directories) - target_include_directories(${ARGN}) - endfunction() -ENDIF() - # Try language- or user-provided path first. 
if(CUDAToolkit_BIN_DIR) find_program(CUDAToolkit_NVCC_EXECUTABLE @@ -854,11 +784,11 @@ if(CUDAToolkit_FOUND) if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) - import_target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - import_target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") + target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") foreach(dep ${arg_DEPS}) if(TARGET CUDA::${dep}) - import_target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) + target_link_libraries(CUDA::${lib_name} INTERFACE CUDA::${dep}) endif() endforeach() endif() @@ -866,8 +796,8 @@ if(CUDAToolkit_FOUND) if(NOT TARGET CUDA::toolkit) add_library(CUDA::toolkit IMPORTED INTERFACE) - import_target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") - import_target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") + target_include_directories(CUDA::toolkit SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") + target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) @@ -882,11 +812,11 @@ if(CUDAToolkit_FOUND) AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) - import_target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) + target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) if(UNIX AND (CMAKE_C_COMPILER OR CMAKE_CXX_COMPILER)) find_package(Threads REQUIRED) - import_target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) + target_link_libraries(CUDA::cudart_static_deps INTERFACE Threads::Threads ${CMAKE_DL_LIBS}) endif() if(UNIX AND NOT APPLE) @@ -896,7 +826,7 @@ 
if(CUDAToolkit_FOUND) if(NOT CUDAToolkit_rt_LIBRARY) message(WARNING "Could not find librt library, needed by CUDA::cudart_static") else() - import_target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) + target_link_libraries(CUDA::cudart_static_deps INTERFACE ${CUDAToolkit_rt_LIBRARY}) endif() endif() endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index a1072a60c6..8d58d96415 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -25,7 +25,7 @@ IF (TARGET CUDA::cuda_driver) SET(FOUND_CUDA_DRIVER TRUE) KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) ELSE() - SET(FOUND_CUDA_DRIVVER FALSE) + SET(FOUND_CUDA_DRIVER FALSE) ENDIF() include(FindPackageHandleStandardArgs) diff --git a/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake b/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake index 1d154e29af..a743fca0e4 100644 --- a/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLPTHREAD.cmake @@ -10,7 +10,7 @@ TRY_COMPILE(KOKKOS_HAS_PTHREAD_ARG # ${CMAKE_CXX${KOKKOS_CXX_STANDARD}_STANDARD_COMPILE_OPTION} INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE_HANDLE_STANDARD_ARGS(PTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG) +FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLPTHREAD DEFAULT_MSG KOKKOS_HAS_PTHREAD_ARG) #Only create the TPL if we succeed IF (KOKKOS_HAS_PTHREAD_ARG) KOKKOS_CREATE_IMPORTED_TPL(PTHREAD diff --git a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake new file mode 100644 index 0000000000..512ad6ceb2 --- /dev/null +++ b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -0,0 +1,11 @@ +include(FindPackageHandleStandardArgs) + +FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) + +find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY 
HSA_RUNTIME_LIBRARY) + +kokkos_create_imported_tpl(ROCM INTERFACE + LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} + COMPILE_DEFINITIONS __HIP_ROCclr__ +) diff --git a/lib/kokkos/cmake/compile_tests/cplusplus14.cpp b/lib/kokkos/cmake/compile_tests/cplusplus14.cpp new file mode 100644 index 0000000000..52ec9885ec --- /dev/null +++ b/lib/kokkos/cmake/compile_tests/cplusplus14.cpp @@ -0,0 +1,8 @@ +#include + +int main() { + // _t versions of type traits were added in C++14 + std::remove_cv_t i = 0; + + return i; +} diff --git a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc index 48c01c070c..a26ac5af4b 100644 --- a/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc +++ b/lib/kokkos/cmake/compile_tests/cuda_compute_capability.cc @@ -72,6 +72,7 @@ int main() { case 72: std::cout << "Set -DKokkos_ARCH_VOLTA72=ON ." << std::endl; break; case 75: std::cout << "Set -DKokkos_ARCH_TURING75=ON ." << std::endl; break; case 80: std::cout << "Set -DKokkos_ARCH_AMPERE80=ON ." << std::endl; break; + case 86: std::cout << "Set -DKokkos_ARCH_AMPERE86=ON ." << std::endl; break; default: std::cout << "Compute capability " << compute_capability << " is not supported" << std::endl; diff --git a/lib/kokkos/cmake/compile_tests/pthread.cpp b/lib/kokkos/cmake/compile_tests/pthread.cpp index 92310da029..3f83bf6a5f 100644 --- a/lib/kokkos/cmake/compile_tests/pthread.cpp +++ b/lib/kokkos/cmake/compile_tests/pthread.cpp @@ -2,7 +2,7 @@ void* kokkos_test(void* args) { return args; } -int main(void) { +int main() { pthread_t thread; /* Use NULL to avoid C++11. Some compilers do not have C++11 by default. 
Forcing C++11 diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index 2e82a46235..fbd6745a60 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -81,10 +81,16 @@ ENDMACRO() FUNCTION(KOKKOS_ADD_TEST) if (KOKKOS_HAS_TRILINOS) CMAKE_PARSE_ARGUMENTS(TEST - "" + "SKIP_TRIBITS" "EXE;NAME;TOOL" "ARGS" ${ARGN}) + + IF(TEST_SKIP_TRIBITS) + MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") + RETURN() + ENDIF() + IF(TEST_EXE) SET(EXE_ROOT ${TEST_EXE}) ELSE() @@ -119,11 +125,10 @@ FUNCTION(KOKKOS_ADD_TEST) endif() else() CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL" + "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" ${ARGN}) - SET(TESTS_ADDED) # To match Tribits, we should always be receiving # the root names of exes/libs IF(TEST_EXE) @@ -135,48 +140,27 @@ FUNCTION(KOKKOS_ADD_TEST) # These should be the full target name SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF (TEST_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${TEST_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${ARG_STR_LIST}) - ELSE() - ADD_TEST(NAME ${TEST_NAME}${TEST_NUMBER} COMMAND ${EXE} ${ARG_STR_LIST}) - ENDIF() - LIST(APPEND TESTS_ADDED "${TEST_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - ENDFOREACH() + IF(WIN32) + ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} + COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) ELSE() - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX}) - ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE}) - ENDIF() - LIST(APPEND TESTS_ADDED 
"${TEST_NAME}") + ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) + ENDIF() + IF(TEST_WILL_FAIL) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + ENDIF() + IF(TEST_FAIL_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_PASS_REGULAR_EXPRESSION) + SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + ENDIF() + IF(TEST_TOOL) + ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") ENDIF() - - FOREACH(TEST_NAME ${TESTS_ADDED}) - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - set_property(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - endif() - ENDFOREACH() VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - endif() + ENDIF() ENDFUNCTION() FUNCTION(KOKKOS_ADD_ADVANCED_TEST) @@ -326,14 +310,6 @@ ENDIF() ENDFUNCTION() -FUNCTION(KOKKOS_TARGET_COMPILE_DEFINITIONS) - IF (KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN}) - ELSE() - TARGET_COMPILE_DEFINITIONS(${TARGET} ${ARGN}) - ENDIF() -ENDFUNCTION() - FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) IF(KOKKOS_HAS_TRILINOS) TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) @@ -350,10 +326,6 @@ ENDIF() ENDFUNCTION() -MACRO(KOKKOS_ADD_COMPILE_OPTIONS) -ADD_COMPILE_OPTIONS(${ARGN}) -ENDMACRO() - MACRO(PRINTALL match) get_cmake_property(_variableNames 
VARIABLES) list (SORT _variableNames) @@ -376,4 +348,3 @@ FUNCTION(GLOBAL_APPEND VARNAME) LIST(APPEND TEMP ${ARGN}) GLOBAL_SET(${VARNAME} ${TEMP}) ENDFUNCTION() - diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index 53aaf7dccf..ec18e70a36 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -35,7 +35,7 @@ KOKKOS_ARCH_OPTION(ARMV80 HOST "ARMv8.0 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV81 HOST "ARMv8.1 Compatible CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX HOST "ARMv8 Cavium ThunderX CPU") KOKKOS_ARCH_OPTION(ARMV8_THUNDERX2 HOST "ARMv8 Cavium ThunderX2 CPU") -KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Suport") +KOKKOS_ARCH_OPTION(A64FX HOST "ARMv8.2 with SVE Support") KOKKOS_ARCH_OPTION(WSM HOST "Intel Westmere CPU") KOKKOS_ARCH_OPTION(SNB HOST "Intel Sandy/Ivy Bridge CPUs") KOKKOS_ARCH_OPTION(HSW HOST "Intel Haswell CPUs") @@ -60,11 +60,12 @@ KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0") KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2") KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5") KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0") +KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6") KOKKOS_ARCH_OPTION(ZEN HOST "AMD Zen architecture") KOKKOS_ARCH_OPTION(ZEN2 HOST "AMD Zen2 architecture") KOKKOS_ARCH_OPTION(VEGA900 GPU "AMD GPU MI25 GFX900") KOKKOS_ARCH_OPTION(VEGA906 GPU "AMD GPU MI50/MI60 GFX906") -KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU") +KOKKOS_ARCH_OPTION(VEGA908 GPU "AMD GPU MI100 GFX908") KOKKOS_ARCH_OPTION(INTEL_GEN GPU "Intel GPUs Gen9+") @@ -141,8 +142,16 @@ ENDIF() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- #clear anything that might be in the cache GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) - SET(AMDGPU_ARCH_FLAG "--amdgpu-target") +IF(KOKKOS_ENABLE_HIP) + IF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(AMDGPU_ARCH_FLAG 
"--amdgpu-target") + ELSE() + SET(AMDGPU_ARCH_FLAG "--offload-arch") + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -x hip) + IF(DEFINED ENV{ROCM_PATH}) + GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + ENDIF() + ENDIF() ENDIF() @@ -183,6 +192,8 @@ ENDIF() IF (KOKKOS_ARCH_A64FX) COMPILER_SPECIFIC_FLAGS( DEFAULT -march=armv8.2-a+sve + Clang -march=armv8.2-a+sve -msve-vector-bits=512 + GCC -march=armv8.2-a+sve -msve-vector-bits=512 ) ENDIF() @@ -309,7 +320,7 @@ IF (KOKKOS_ARCH_POWER8 OR KOKKOS_ARCH_POWER9) SET(KOKKOS_USE_ISA_POWERPCLE ON) ENDIF() -IF (Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) +IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( Clang -fcuda-rdc NVIDIA --relocatable-device-code=true @@ -333,8 +344,8 @@ ENDIF() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (Kokkos_ENABLE_HIP) - IF (Kokkos_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) +IF (KOKKOS_ENABLE_HIP) + IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) COMPILER_SPECIFIC_FLAGS( DEFAULT -fgpu-rdc ) @@ -345,8 +356,7 @@ IF (Kokkos_ENABLE_HIP) ENDIF() ENDIF() - -IF (Kokkos_ENABLE_SYCL) +IF (KOKKOS_ENABLE_SYCL) COMPILER_SPECIFIC_FLAGS( DEFAULT -fsycl ) @@ -363,7 +373,7 @@ FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") ENDIF() SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET) + IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL) MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA and Kokkos_ENABLE_OPENMPTARGET are OFF. 
Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() @@ -396,6 +406,7 @@ CHECK_CUDA_ARCH(VOLTA70 sm_70) CHECK_CUDA_ARCH(VOLTA72 sm_72) CHECK_CUDA_ARCH(TURING75 sm_75) CHECK_CUDA_ARCH(AMPERE80 sm_80) +CHECK_CUDA_ARCH(AMPERE86 sm_86) SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) @@ -405,12 +416,12 @@ FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) ENDIF() SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET) - MESSAGE(WARNING "Given HIP arch ${ARCH}, but Kokkos_ENABLE_AMDGPU and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") + MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) ELSE() SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_HIP) + IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") ENDIF() ENDIF() @@ -451,6 +462,24 @@ IF (KOKKOS_ENABLE_OPENMPTARGET) ENDIF() ENDIF() +IF (KOKKOS_ENABLE_SYCL) + IF(CUDA_ARCH_ALREADY_SPECIFIED) + IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda-sycldevice + ) + # FIXME_SYCL The CUDA backend doesn't support printf yet. 
+ GLOBAL_SET(KOKKOS_IMPL_DISABLE_SYCL_DEVICE_PRINTF ON) + ELSE() + MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") + ENDIF() + ELSEIF(KOKKOS_ARCH_INTEL_GEN) + COMPILER_SPECIFIC_FLAGS( + DEFAULT -fsycl-targets=spir64_gen-unknown-unknown-sycldevice -Xsycl-target-backend "-device skl" + ) + ENDIF() +ENDIF() + IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) @@ -464,6 +493,43 @@ IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + + # if user is using kokkos_compiler_launcher, above will fail. + IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) + GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough + IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + # make sure the user knows that we aren't using CUDA compiler for anything else + MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. 
Enabling CUDA language ONLY to auto-detect architecture...") + INCLUDE(CheckLanguage) + CHECK_LANGUAGE(CUDA) + IF(CMAKE_CUDA_COMPILER) + ENABLE_LANGUAGE(CUDA) + ELSE() + MESSAGE(STATUS "CUDA language could not be enabled") + ENDIF() + ENDIF() + + # if CUDA was enabled, this will be defined + IF(CMAKE_CUDA_COMPILER) + # copy our test to .cu so cmake compiles as CUDA + CONFIGURE_FILE( + ${PROJECT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COPYONLY + ) + # run test again + TRY_RUN( + _RESULT + _COMPILE_RESULT + ${_BINARY_TEST_DIR} + ${PROJECT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu + COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + ENDIF() + ENDIF() + LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") @@ -500,7 +566,7 @@ IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_ARCH_VOLTA ON) ENDIF() - IF (KOKKOS_ARCH_AMPERE80) + IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) SET(KOKKOS_ARCH_AMPERE ON) ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index e6600161f9..4434d6928f 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -27,6 +27,12 @@ IF(Kokkos_ENABLE_CUDA) PATHS ${PROJECT_SOURCE_DIR} PATH_SUFFIXES bin) + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + # check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # if launcher was found and nvcc_wrapper was not specified as @@ -37,7 +43,7 @@ IF(Kokkos_ENABLE_CUDA) # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper 
kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) SET(INTERNAL_USE_COMPILER_LAUNCHER true) ENDIF() ENDIF() @@ -55,32 +61,7 @@ IF(INTERNAL_HAVE_COMPILER_NVCC) SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") IF(INTERNAL_USE_COMPILER_LAUNCHER) - IF(Kokkos_LAUNCH_COMPILER_INFO) - GET_FILENAME_COMPONENT(BASE_COMPILER_NAME ${CMAKE_CXX_COMPILER} NAME) - # does not have STATUS intentionally - MESSAGE("") - MESSAGE("Kokkos_LAUNCH_COMPILER_INFO (${Kokkos_COMPILE_LAUNCHER}):") - MESSAGE(" - Kokkos + CUDA backend requires the C++ files to be compiled as CUDA code.") - MESSAGE(" - kokkos_launch_compiler permits CMAKE_CXX_COMPILER to be set to a traditional C++ compiler when Kokkos_ENABLE_CUDA=ON") - MESSAGE(" by prefixing all the compile and link commands with the path to the script + CMAKE_CXX_COMPILER (${CMAKE_CXX_COMPILER}).") - MESSAGE(" - If any of the compile or link commands have CMAKE_CXX_COMPILER as the first argument, it replaces CMAKE_CXX_COMPILER with nvcc_wrapper.") - MESSAGE(" - If the compile or link command is not CMAKE_CXX_COMPILER, it just executes the command.") - MESSAGE(" - If using ccache, set CMAKE_CXX_COMPILER to nvcc_wrapper explicitly.") - MESSAGE(" - kokkos_compiler_launcher is available to downstream projects as well.") - MESSAGE(" - If CMAKE_CXX_COMPILER=nvcc_wrapper, all legacy behavior will be preserved during 'find_package(Kokkos)'") - MESSAGE(" - If CMAKE_CXX_COMPILER is not nvcc_wrapper, 'find_package(Kokkos)' will apply 'kokkos_compilation(GLOBAL)' unless separable compilation is enabled") - MESSAGE(" - This can be disabled via '-DKokkos_LAUNCH_COMPILER=OFF'") - MESSAGE(" - Use 'find_package(Kokkos COMPONENTS separable_compilation)' to enable 
separable compilation") - MESSAGE(" - Separable compilation allows you to control the scope of where the compiler transformation behavior (${BASE_COMPILER_NAME} -> nvcc_wrapper) is applied") - MESSAGE(" - The compiler transformation can be applied on a per-project, per-directory, per-target, and/or per-source-file basis") - MESSAGE(" - 'kokkos_compilation(PROJECT)' will apply the compiler transformation to all targets in a project/subproject") - MESSAGE(" - 'kokkos_compilation(TARGET [...])' will apply the compiler transformation to the specified target(s)") - MESSAGE(" - 'kokkos_compilation(SOURCE [...])' will apply the compiler transformation to the specified source file(s)") - MESSAGE(" - 'kokkos_compilation(DIRECTORY [...])' will apply the compiler transformation to the specified directories") - MESSAGE("") - ELSE() - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled... Set Kokkos_LAUNCH_COMPILER_INFO=ON for more info.") - ENDIF() + MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) ENDIF() ENDIF() @@ -92,7 +73,11 @@ IF(Kokkos_ENABLE_HIP) OUTPUT_STRIP_TRAILING_WHITESPACE) STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - SET(KOKKOS_CXX_COMPILER_ID HIP CACHE STRING INTERNAL FORCE) + + STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + ENDIF() STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) @@ -103,8 +88,7 @@ ENDIF() IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep Cray - COMMAND wc -l + COMMAND grep -c Cray OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER 
OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang @@ -112,8 +96,7 @@ IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) ENDIF() # The clang based Intel compiler reports as Clang to most versions of CMake EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep icpx - COMMAND wc -l + COMMAND grep -c "DPC++\\|icpx" OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER OUTPUT_STRIP_TRAILING_WHITESPACE) IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang @@ -174,7 +157,7 @@ ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIP) +ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 3.8.0) MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") ENDIF() diff --git a/lib/kokkos/cmake/kokkos_corner_cases.cmake b/lib/kokkos/cmake/kokkos_corner_cases.cmake index 3962c4b16e..a84ac2b630 100644 --- a/lib/kokkos/cmake/kokkos_corner_cases.cmake +++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake @@ -49,11 +49,14 @@ ENDIF() IF (KOKKOS_CXX_STANDARD STREQUAL 17) IF (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 7) - MESSAGE(FATAL_ERROR "You have requested c++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC <= 6 does not properly support *this capture. Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.") + MESSAGE(FATAL_ERROR "You have requested C++17 support for GCC ${KOKKOS_CXX_COMPILER_VERSION}. Although CMake has allowed this and GCC accepts -std=c++1z/c++17, GCC < 7 does not properly support *this capture. 
Please reduce the C++ standard to 14 or upgrade the compiler if you do need C++17 support.") ENDIF() IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11) - MESSAGE(FATAL_ERROR "You have requested c++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.") + MESSAGE(FATAL_ERROR "You have requested C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION}. NVCC only supports C++17 from version 11 on. Please reduce the C++ standard to 14 or upgrade the compiler if you need C++17 support.") + ENDIF() + IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR) + MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON with C++17 support for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs. See https://github.com/kokkos/kokkos/issues/3496") ENDIF() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index 41ee10a8a0..445dad47ce 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -48,9 +48,6 @@ IF(KOKKOS_ENABLE_OPENMP) IF(KOKKOS_CLANG_IS_CRAY) SET(ClangOpenMPFlag -fopenmp) ENDIF() - IF(KOKKOS_CLANG_IS_INTEL) - SET(ClangOpenMPFlag -fiopenmp) - ENDIF() IF(KOKKOS_COMPILER_CLANG_MSVC) #for clang-cl expression /openmp yields an error, so directly add the specific Clang flag SET(ClangOpenMPFlag /clang:-fopenmp=libomp) @@ -64,6 +61,7 @@ IF(KOKKOS_ENABLE_OPENMP) COMPILER_SPECIFIC_FLAGS( COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID Clang -Xcompiler ${ClangOpenMPFlag} + IntelClang -Xcompiler -fiopenmp PGI -Xcompiler -mp Cray NO-VALUE-SPECIFIED XL -Xcompiler -qsmp=omp @@ -72,6 +70,7 @@ IF(KOKKOS_ENABLE_OPENMP) ELSE() COMPILER_SPECIFIC_FLAGS( Clang ${ClangOpenMPFlag} + IntelClang -fiopenmp AppleClang -Xpreprocessor -fopenmp PGI -mp Cray 
NO-VALUE-SPECIFIED @@ -152,3 +151,11 @@ IF (KOKKOS_ENABLE_HIP) ENDIF() KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") + +## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros +IF (KOKKOS_ENABLE_SYCL) + IF(KOKKOS_CXX_STANDARD LESS 17) + MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") + ENDIF() + LIST(APPEND DEVICE_SETUP_LIST SYCL) +ENDIF() diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 5df498f373..95bce66c7b 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -48,6 +48,7 @@ KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler war KOKKOS_ENABLE_OPTION(PROFILING_LOAD_PRINT OFF "Whether to print information about which profiling tools got loaded") KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +KOKKOS_ENABLE_OPTION(LAUNCH_COMPILER ON "Whether to potentially use the launch compiler") IF (KOKKOS_ENABLE_CUDA) SET(KOKKOS_COMPILER_CUDA_VERSION "${KOKKOS_COMPILER_VERSION_MAJOR}${KOKKOS_COMPILER_VERSION_MINOR}") @@ -68,6 +69,15 @@ ELSE() ENDIF() KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") +IF (KOKKOS_ENABLE_TESTS) + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +ELSE() + SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +ENDIF() +KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") +IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. 
Option will be ignored.") +ENDIF() IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) SET(CUDA_CONSTEXPR_DEFAULT ON) @@ -76,14 +86,14 @@ ELSE() ENDIF() KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") +Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") + FUNCTION(check_device_specific_options) CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) FOREACH(OPTION ${SOME_OPTIONS}) - IF(CMAKE_VERSION VERSION_GREATER_EQUAL 3.14) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() + IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") ENDIF() IF(KOKKOS_ENABLE_${OPTION}) MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. 
Option will be ignored.") diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index 2b17d648b4..858322394d 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -169,9 +169,7 @@ MACRO(kokkos_export_imported_tpl NAME) ENDIF() SET(TPL_LINK_OPTIONS) - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13.0") - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - ENDIF() + GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) IF(TPL_LINK_OPTIONS) KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") ENDIF() @@ -230,9 +228,7 @@ MACRO(kokkos_import_tpl NAME) # I have still been getting errors about ROOT variables being ignored # I'm not sure if this is a scope issue - but make sure # the policy is set before we do any find_package calls - IF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.12.0") - CMAKE_POLICY(SET CMP0074 NEW) - ENDIF() + CMAKE_POLICY(SET CMP0074 NEW) IF (KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find @@ -314,7 +310,7 @@ MACRO(kokkos_create_imported_tpl NAME) CMAKE_PARSE_ARGUMENTS(TPL "INTERFACE" "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_OPTIONS;LINK_OPTIONS" + "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN}) @@ -334,6 +330,9 @@ MACRO(kokkos_create_imported_tpl NAME) IF(TPL_INCLUDES) TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + ENDIF() IF(TPL_COMPILE_OPTIONS) TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) ENDIF() @@ -355,6 +354,10 @@ MACRO(kokkos_create_imported_tpl NAME) SET_TARGET_PROPERTIES(${NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") ENDIF() + IF(TPL_COMPILE_DEFINITIONS) + SET_TARGET_PROPERTIES(${NAME} PROPERTIES + INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") 
+ ENDIF() IF(TPL_COMPILE_OPTIONS) SET_TARGET_PROPERTIES(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") @@ -770,7 +773,7 @@ FUNCTION(kokkos_link_tpl TARGET) ENDFUNCTION() FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIP Fujitsu) + SET(COMPILERS NVIDIA PGI XL DEFAULT Cray Intel Clang AppleClang IntelClang GNU HIPCC Fujitsu) CMAKE_PARSE_ARGUMENTS( PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" @@ -926,6 +929,9 @@ ENDFUNCTION() # DIRECTORY --> all files in directory # PROJECT --> all files/targets in a project/subproject # +# NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. +# This version explicitly uses nvcc_wrapper. +# FUNCTION(kokkos_compilation) # check whether the compiler already supports building CUDA KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) @@ -947,10 +953,21 @@ FUNCTION(kokkos_compilation) MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") ENDIF() + # find nvcc_wrapper + FIND_PROGRAM(Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin) + + IF(NOT Kokkos_COMPILE_LAUNCHER) + MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. 
Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") + ENDIF() + IF(COMP_GLOBAL) # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") ELSE() FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) @@ -961,8 +978,8 @@ FUNCTION(kokkos_compilation) # set the properties if defined IF(COMP_${_TYPE}) # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") + SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") ENDIF() ENDFOREACH() ENDIF() diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 1d7da922eb..707fb000af 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -86,6 +86,19 @@ ELSE() MESSAGE(FATAL_ERROR "Unknown C++ standard ${KOKKOS_CXX_STANDARD} - must be 14, 17, or 20") ENDIF() +# Enforce that we can compile a simple C++14 program + +TRY_COMPILE(CAN_COMPILE_CPP14 + ${KOKKOS_TOP_BUILD_DIR}/corner_cases + 
${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus14.cpp + OUTPUT_VARIABLE ERROR_MESSAGE + CXX_STANDARD 14 +) +if (NOT CAN_COMPILE_CPP14) + UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this + MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++14 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") +ENDIF() +UNSET(CAN_COMPILE_CPP14 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index b58d3696ea..d8d044c9d7 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -1,5 +1,6 @@ KOKKOS_CFG_DEPENDS(TPLS OPTIONS) KOKKOS_CFG_DEPENDS(TPLS DEVICES) +KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) CMAKE_PARSE_ARGUMENTS(PARSED @@ -38,6 +39,12 @@ IF(KOKKOS_ENABLE_MEMKIND) ENDIF() KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) KOKKOS_TPL_OPTION(LIBRT Off) +IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + SET(ROCM_DEFAULT ON) +ELSE() + SET(ROCM_DEFAULT OFF) +ENDIF() +KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) IF (WIN32) SET(LIBDL_DEFAULT Off) @@ -70,6 +77,7 @@ KOKKOS_IMPORT_TPL(LIBRT) KOKKOS_IMPORT_TPL(LIBDL) KOKKOS_IMPORT_TPL(MEMKIND) KOKKOS_IMPORT_TPL(PTHREAD INTERFACE) +KOKKOS_IMPORT_TPL(ROCM INTERFACE) #Convert list to newlines (which CMake doesn't always like in cache variables) STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 059fb192f0..afa036066a 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -141,39 +141,54 @@ FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) 
ENDFUNCTION() FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) -CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) -VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) + CMAKE_PARSE_ARGUMENTS(PARSE + "" + "" + "SOURCES;CATEGORIES;ARGS" + ${ARGN}) + VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) -IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS kokkos_gtest - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) -ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) -ENDIF() + IF (KOKKOS_HAS_TRILINOS) + IF(DEFINED PARSE_ARGS) + STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") + ENDIF() + TRIBITS_ADD_EXECUTABLE_AND_TEST( + ${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + TESTONLYLIBS kokkos_gtest + NUM_MPI_PROCS 1 + COMM serial mpi + ARGS ${PARSE_ARGS} + CATEGORIES ${PARSE_CATEGORIES} + SOURCES ${PARSE_SOURCES} + FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${PARSE_ARGS} + ) + ELSE() + KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} + SOURCES ${PARSE_SOURCES} + ) + IF (PARSE_ARGS) + SET(TEST_NUMBER 0) + FOREACH (ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + KOKKOS_ADD_TEST(NAME ${TEST_NAME} + EXE ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION " FAILED " + ARGS ${ARG_STR_LIST} + ) + ENDFOREACH() + ELSE() + KOKKOS_ADD_TEST(NAME ${ROOT_NAME} + EXE ${ROOT_NAME} + 
FAIL_REGULAR_EXPRESSION " FAILED " + ) + ENDIF() + ENDIF() ENDFUNCTION() FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) @@ -301,11 +316,26 @@ ENDMACRO() ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) - # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to nvcc_wrapper + + # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler + # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler + IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + ELSE() + IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + ENDIF() + ENDIF() + + CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler + ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler + @ONLY) + INSTALL(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" "${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler" + "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR}) INSTALL(FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" @@ -313,7 +343,7 @@ MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_PostInclude.hpp" - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) + DESTINATION ${KOKKOS_HEADER_DIR}) ENDMACRO() FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) @@ -330,24 +360,12 @@ FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}> ) - ELSEIF(${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.13") + ELSE() #I 
can use link options #just assume CXX linkage TARGET_LINK_OPTIONS( ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} ) - ELSE() - #assume CXX linkage, we have no good way to check otherwise - IF (PARSE_PLAIN_STYLE) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} ${KOKKOS_LINK_OPTIONS} - ) - ELSE() - #well, have to do it the wrong way for now - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() ENDIF() TARGET_COMPILE_OPTIONS( @@ -448,6 +466,13 @@ FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) ${PARSE_SOURCES} ) + IF(PARSE_SHARED OR BUILD_SHARED_LIBS) + SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES + VERSION ${Kokkos_VERSION} + SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + ) + ENDIF() + KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name diff --git a/lib/kokkos/containers/src/CMakeLists.txt b/lib/kokkos/containers/src/CMakeLists.txt index 7000624b6b..98655896d4 100644 --- a/lib/kokkos/containers/src/CMakeLists.txt +++ b/lib/kokkos/containers/src/CMakeLists.txt @@ -26,8 +26,6 @@ KOKKOS_ADD_LIBRARY( HEADERS ${KOKKOS_CONTAINER_HEADERS} ) -SET_TARGET_PROPERTIES(kokkoscontainers PROPERTIES VERSION ${Kokkos_VERSION}) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} @@ -36,4 +34,3 @@ KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- - diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index 689f0eb2ed..45710d1f73 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -91,6 +91,25 @@ namespace Kokkos { * behavior. Please see the documentation of Kokkos::View for * examples. The default suffices for most users. 
*/ + +namespace Impl { + +#ifdef KOKKOS_ENABLE_CUDA + +inline const Kokkos::Cuda& get_cuda_space(const Kokkos::Cuda& in) { return in; } + +inline const Kokkos::Cuda& get_cuda_space() { + return *Kokkos::Impl::cuda_get_deep_copy_space(); +} + +template +inline const Kokkos::Cuda& get_cuda_space(const NonCudaExecSpace&) { + return get_cuda_space(); +} + +#endif // KOKKOS_ENABLE_CUDA + +} // namespace Impl template class DualView : public ViewTraits { @@ -295,6 +314,53 @@ class DualView : public ViewTraits { "DualView constructed with incompatible views"); } } + // does the DualView have only one device + struct impl_dualview_is_single_device { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the device of t_dev? + template + struct impl_device_matches_tdev_device { + enum : bool { + value = std::is_same::value + }; + }; + // does the given device match the device of t_host? + template + struct impl_device_matches_thost_device { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the execution space of t_host? + template + struct impl_device_matches_thost_exec { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device match the execution space of t_dev? + template + struct impl_device_matches_tdev_exec { + enum : bool { + value = std::is_same::value + }; + }; + + // does the given device's memory space match the memory space of t_dev? + template + struct impl_device_matches_tdev_memory_space { + enum : bool { + value = std::is_same::value + }; + }; //@} //! \name Methods for synchronizing, marking as modified, and getting Views. @@ -302,7 +368,7 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the if_c expression in the return + /// Please don't be afraid of the nested if_c expressions in the return /// value's type. 
That just tells the method what the return type /// should be: t_dev if the \c Device template parameter matches /// this DualView's device type, else t_host. @@ -323,10 +389,17 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename Impl::if_c< - std::is_same::value, - t_dev, t_host>::type& + KOKKOS_INLINE_FUNCTION const typename std::conditional_t< + impl_device_matches_tdev_device::value, t_dev, + typename std::conditional_t< + impl_device_matches_thost_device::value, t_host, + typename std::conditional_t< + impl_device_matches_thost_exec::value, t_host, + typename std::conditional_t< + impl_device_matches_tdev_exec::value, t_dev, + typename std::conditional_t< + impl_device_matches_tdev_memory_space::value, + t_dev, t_host> > > > > view() const { constexpr bool device_is_memspace = std::is_same::value; @@ -463,6 +536,7 @@ class DualView : public ViewTraits { true); } } + /// \brief Update data on device or host only if data in the other /// space has been marked as modified. /// @@ -480,12 +554,9 @@ class DualView : public ViewTraits { /// the data in either View. You must manually mark modified data /// as modified, by calling the modify() method with the /// appropriate template parameter. - template - void sync(const typename std::enable_if< - (std::is_same::value) || - (std::is_same::value), - int>::type& = 0) { + // deliberately passing args by cref as they're used multiple times + template + void sync_impl(std::true_type, Args const&... 
args) { if (modified_flags.data() == nullptr) return; int dev = get_device_side(); @@ -497,12 +568,12 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif - deep_copy(d_view, h_view); + deep_copy(args..., d_view, h_view); modified_flags(0) = modified_flags(1) = 0; impl_report_device_sync(); } @@ -514,12 +585,12 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif - deep_copy(h_view, d_view); + deep_copy(args..., h_view, d_view); modified_flags(0) = modified_flags(1) = 0; impl_report_host_sync(); } @@ -533,10 +604,26 @@ class DualView : public ViewTraits { template void sync(const typename std::enable_if< - (!std::is_same::value) || + (std::is_same::value) || (std::is_same::value), int>::type& = 0) { + sync_impl(std::true_type{}); + } + + template + void sync(const ExecutionSpace& exec, + const typename std::enable_if< + (std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::true_type{}, exec); + } + + // deliberately passing args by cref as they're used multiple times + template + void sync_impl(std::false_type, Args const&...) 
{ if (modified_flags.data() == nullptr) return; int dev = get_device_side(); @@ -557,7 +644,27 @@ class DualView : public ViewTraits { } } - void sync_host() { + template + void sync(const typename std::enable_if< + (!std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::false_type{}); + } + template + void sync(const ExecutionSpace& exec, + const typename std::enable_if< + (!std::is_same::value) || + (std::is_same::value), + int>::type& = 0) { + sync_impl(std::false_type{}, exec); + } + + // deliberately passing args by cref as they're used multiple times + template + void sync_host_impl(Args const&... args) { if (!std::is_same::value) Impl::throw_runtime_exception( @@ -569,18 +676,26 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), false); } #endif - deep_copy(h_view, d_view); + deep_copy(args..., h_view, d_view); modified_flags(1) = modified_flags(0) = 0; impl_report_host_sync(); } } - void sync_device() { + template + void sync_host(const ExecSpace& exec) { + sync_host_impl(exec); + } + void sync_host() { sync_host_impl(); } + + // deliberately passing args by cref as they're used multiple times + template + void sync_device_impl(Args const&... 
args) { if (!std::is_same::value) Impl::throw_runtime_exception( @@ -592,17 +707,23 @@ class DualView : public ViewTraits { Kokkos::CudaUVMSpace>::value) { if (d_view.data() == h_view.data()) Kokkos::Impl::cuda_prefetch_pointer( - Kokkos::Cuda(), d_view.data(), + Impl::get_cuda_space(args...), d_view.data(), sizeof(typename t_dev::value_type) * d_view.span(), true); } #endif - deep_copy(d_view, h_view); + deep_copy(args..., d_view, h_view); modified_flags(1) = modified_flags(0) = 0; impl_report_device_sync(); } } + template + void sync_device(const ExecSpace& exec) { + sync_device_impl(exec); + } + void sync_device() { sync_device_impl(); } + template bool need_sync() const { if (modified_flags.data() == nullptr) return false; @@ -658,6 +779,7 @@ class DualView : public ViewTraits { template void modify() { if (modified_flags.data() == nullptr) return; + if (impl_dualview_is_single_device::value) return; int dev = get_device_side(); if (dev == 1) { // if Device is the same as DualView's device type @@ -690,6 +812,7 @@ class DualView : public ViewTraits { } inline void modify_host() { + if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(0) = (modified_flags(1) > modified_flags(0) ? modified_flags(1) @@ -710,6 +833,7 @@ class DualView : public ViewTraits { } inline void modify_device() { + if (impl_dualview_is_single_device::value) return; if (modified_flags.data() != nullptr) { modified_flags(1) = (modified_flags(1) > modified_flags(0) ? 
modified_flags(1) diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index c66d7a5f36..c6323fef93 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -245,13 +245,10 @@ KOKKOS_INLINE_FUNCTION bool dyn_rank_view_verify_operator_bounds( return (size_t(i) < map.extent(R)) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else if (i != 0) { - // FIXME_SYCL SYCL doesn't allow printf in kernels -#ifndef KOKKOS_ENABLE_SYCL - printf( + KOKKOS_IMPL_DO_NOT_USE_PRINTF( "DynRankView Debug Bounds Checking Error: at rank %u\n Extra " "arguments beyond the rank must be zero \n", R); -#endif return (false) && dyn_rank_view_verify_operator_bounds(rank, map, args...); } else { @@ -575,37 +572,22 @@ class DynRankView : public ViewTraits { (is_layout_left || is_layout_right || is_layout_stride) }; - template ::accessible> - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() {} - }; - - template - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() { - Kokkos::abort( - "Kokkos::DynRankView ERROR: attempt to access inaccessible memory " - "space"); - }; - }; - // Bounds checking macros #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) // rank of the calling operator - included as first argument in ARG -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ - DynRankView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \ - Kokkos::Impl::dyn_rank_view_verify_operator_bounds< \ - typename traits::memory_space> \ +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); \ + Kokkos::Impl::dyn_rank_view_verify_operator_bounds< \ + typename traits::memory_space> \ ARG; #else -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ - DynRankView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(ARG) \ 
+ Kokkos::Impl::verify_space::check(); #endif diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index 06bd556661..cc949d4c55 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -76,6 +76,12 @@ struct ChunkArraySpace { using memory_space = typename Kokkos::Experimental::HIPHostPinnedSpace; }; #endif +#ifdef KOKKOS_ENABLE_SYCL +template <> +struct ChunkArraySpace { + using memory_space = typename Kokkos::Experimental::SYCLSharedUSMSpace; +}; +#endif } // end namespace Impl /** \brief Dynamic views are restricted to rank-one and no layout. diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 4fd084338e..0f21a08ba3 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -377,34 +377,20 @@ class OffsetView : public ViewTraits { std::is_same::value && (is_layout_left || is_layout_right || is_layout_stride); - template ::accessible> - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() {} - }; - - template - struct verify_space { - KOKKOS_FORCEINLINE_FUNCTION static void check() { - Kokkos::abort( - "Kokkos::View ERROR: attempt to access inaccessible memory space"); - }; - }; - #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - OffsetView::template verify_space< \ - Kokkos::Impl::ActiveExecutionMemorySpace>::check(); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); \ + Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ + typename traits::memory_space> \ ARG; #else -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - OffsetView::template verify_space< \ - 
Kokkos::Impl::ActiveExecutionMemorySpace>::check(); +#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ + Kokkos::Impl::verify_space::check(); #endif public: diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 5e18f5a80e..dcd4cf73e5 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -649,13 +649,13 @@ struct ReduceDuplicatesBase { size_t stride; size_t start; size_t n; - ReduceDuplicatesBase(ValueType const* src_in, ValueType* dest_in, - size_t stride_in, size_t start_in, size_t n_in, - std::string const& name) + ReduceDuplicatesBase(ExecSpace const& exec_space, ValueType const* src_in, + ValueType* dest_in, size_t stride_in, size_t start_in, + size_t n_in, std::string const& name) : src(src_in), dst(dest_in), stride(stride_in), start(start_in), n(n_in) { parallel_for( std::string("Kokkos::ScatterView::ReduceDuplicates [") + name + "]", - RangePolicy(0, stride), + RangePolicy(exec_space, 0, stride), static_cast(*this)); } }; @@ -667,9 +667,10 @@ template struct ReduceDuplicates : public ReduceDuplicatesBase { using Base = ReduceDuplicatesBase; - ReduceDuplicates(ValueType const* src_in, ValueType* dst_in, size_t stride_in, - size_t start_in, size_t n_in, std::string const& name) - : Base(src_in, dst_in, stride_in, start_in, n_in, name) {} + ReduceDuplicates(ExecSpace const& exec_space, ValueType const* src_in, + ValueType* dst_in, size_t stride_in, size_t start_in, + size_t n_in, std::string const& name) + : Base(exec_space, src_in, dst_in, stride_in, start_in, n_in, name) {} KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { for (size_t j = Base::start; j < Base::n; ++j) { ScatterValue struct ResetDuplicatesBase { using Derived = ResetDuplicates; ValueType* data; - ResetDuplicatesBase(ValueType* data_in, size_t size_in, - std::string const& name) + ResetDuplicatesBase(ExecSpace const& exec_space, ValueType* data_in, 
+ size_t size_in, std::string const& name) : data(data_in) { parallel_for( std::string("Kokkos::ScatterView::ResetDuplicates [") + name + "]", - RangePolicy(0, size_in), + RangePolicy(exec_space, 0, size_in), static_cast(*this)); } }; @@ -703,8 +704,9 @@ struct ResetDuplicatesBase { template struct ResetDuplicates : public ResetDuplicatesBase { using Base = ResetDuplicatesBase; - ResetDuplicates(ValueType* data_in, size_t size_in, std::string const& name) - : Base(data_in, size_in, name) {} + ResetDuplicates(ExecSpace const& exec_space, ValueType* data_in, + size_t size_in, std::string const& name) + : Base(exec_space, data_in, size_in, name) {} KOKKOS_FORCEINLINE_FUNCTION void operator()(size_t i) const { ScatterValue @@ -713,6 +715,16 @@ struct ResetDuplicates : public ResetDuplicatesBase { } }; +template +void check_scatter_view_allocation_properties_argument( + ViewCtorProp const&) { + static_assert(ViewCtorProp::has_execution_space && + ViewCtorProp::has_label && + ViewCtorProp::initialize, + "Allocation property must have an execution name as well as a " + "label, and must perform the view initialization"); +} + } // namespace Experimental } // namespace Impl } // namespace Kokkos @@ -762,10 +774,26 @@ class ScatterView const& original_view) : internal_view(original_view) {} + template + ScatterView(execution_space const& /* exec_space */, + View const& original_view) + : internal_view(original_view) {} + template ScatterView(std::string const& name, Dims... dims) : internal_view(name, dims...) {} + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template + ScatterView(::Kokkos::Impl::ViewCtorProp const& arg_prop, Dims... dims) + : internal_view(arg_prop, dims...) 
{ + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + } + template KOKKOS_FUNCTION ScatterView( const ScatterView void contribute_into(View const& dest) const { + contribute_into(execution_space(), dest); + } + + template + void contribute_into(execution_space const& exec_space, + View const& dest) const { using dest_type = View; static_assert(std::is_same::value, "ScatterView contribute destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView contribute destination memory space not accessible"); if (dest.data() == internal_view.data()) return; Kokkos::Impl::Experimental::ReduceDuplicates( - internal_view.data(), dest.data(), 0, 0, 1, internal_view.label()); + exec_space, internal_view.data(), dest.data(), 0, 0, 1, + internal_view.label()); } - void reset() { + void reset(execution_space const& exec_space = execution_space()) { Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data(), internal_view.size(), internal_view.label()); + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); } template void reset_except(View const& view) { - if (view.data() != internal_view.data()) reset(); + reset_except(execution_space(), view); + } + + template + void reset_except(const execution_space& exec_space, + View const& view) { + if (view.data() != internal_view.data()) reset(exec_space); } void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, @@ -928,10 +970,16 @@ class ScatterView ScatterView(View const& original_view) + : ScatterView(execution_space(), original_view) {} + + template + ScatterView(execution_space const& exec_space, + View const& original_view) : unique_token(), internal_view( 
view_alloc(WithoutInitializing, - std::string("duplicated_") + original_view.label()), + std::string("duplicated_") + original_view.label(), + exec_space), unique_token.size(), original_view.rank_dynamic > 0 ? original_view.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -949,14 +997,32 @@ class ScatterView ScatterView(std::string const& name, Dims... dims) - : internal_view(view_alloc(WithoutInitializing, name), + : ScatterView(view_alloc(execution_space(), name), dims...) {} + + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template + ScatterView(::Kokkos::Impl::ViewCtorProp const& arg_prop, Dims... dims) + : internal_view(view_alloc(WithoutInitializing, + static_cast<::Kokkos::Impl::ViewCtorProp< + void, std::string> const&>(arg_prop) + .value), unique_token.size(), dims...) { - reset(); + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + + auto const exec_space = + static_cast<::Kokkos::Impl::ViewCtorProp const&>( + arg_prop) + .value; + reset(exec_space); } template @@ -984,37 +1050,51 @@ class ScatterView void contribute_into(View const& dest) const { + contribute_into(execution_space(), dest); + } + + template + void contribute_into(execution_space const& exec_space, + View const& dest) const { using dest_type = View; static_assert(std::is_same::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); bool is_equal = (dest.data() == internal_view.data()); size_t start = is_equal ? 
1 : 0; Kokkos::Impl::Experimental::ReduceDuplicates( - internal_view.data(), dest.data(), internal_view.stride(0), start, - internal_view.extent(0), internal_view.label()); + exec_space, internal_view.data(), dest.data(), internal_view.stride(0), + start, internal_view.extent(0), internal_view.label()); } - void reset() { + void reset(execution_space const& exec_space = execution_space()) { Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data(), internal_view.size(), internal_view.label()); + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); } + template void reset_except(View const& view) { + reset_except(execution_space(), view); + } + + template + void reset_except(execution_space const& exec_space, + View const& view) { if (view.data() != internal_view.data()) { - reset(); + reset(exec_space); return; } Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data() + view.size(), internal_view.size() - view.size(), - internal_view.label()); + exec_space, internal_view.data() + view.size(), + internal_view.size() - view.size(), internal_view.label()); } void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, @@ -1075,7 +1155,13 @@ class ScatterView - ScatterView(View const& original_view) : unique_token() { + ScatterView(View const& original_view) + : ScatterView(execution_space(), original_view) {} + + template + ScatterView(execution_space const& exec_space, + View const& original_view) + : unique_token() { size_t arg_N[8] = {original_view.rank > 0 ? original_view.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, original_view.rank > 1 ? original_view.extent(1) @@ -1094,14 +1180,27 @@ class ScatterView - ScatterView(std::string const& name, Dims... dims) { + ScatterView(std::string const& name, Dims... dims) + : ScatterView(view_alloc(execution_space(), name), dims...) 
{} + + // This overload allows specifying an execution space instance to be + // used by passing, e.g., Kokkos::view_alloc(exec_space, "label") as + // first argument. + template + ScatterView(::Kokkos::Impl::ViewCtorProp const& arg_prop, + Dims... dims) { + using ::Kokkos::Impl::Experimental:: + check_scatter_view_allocation_properties_argument; + check_scatter_view_allocation_properties_argument(arg_prop); + original_view_type original_view; size_t arg_N[8] = {original_view.rank > 0 ? original_view.static_extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -1120,10 +1219,20 @@ class ScatterView const&>( + arg_prop) + .value; internal_view = internal_view_type(view_alloc(WithoutInitializing, name), arg_N[0], arg_N[1], arg_N[2], arg_N[3], arg_N[4], arg_N[5], arg_N[6], arg_N[7]); - reset(); + + auto const exec_space = + static_cast<::Kokkos::Impl::ViewCtorProp const&>( + arg_prop) + .value; + reset(exec_space); } template @@ -1166,6 +1275,12 @@ class ScatterView void contribute_into(View const& dest) const { + contribute_into(execution_space(), dest); + } + + template + void contribute_into(execution_space const& exec_space, + View const& dest) const { using dest_type = View; static_assert( std::is_same::value, "ScatterView deep_copy destination has different layout"); static_assert( - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< - memory_space, typename dest_type::memory_space>::value, + Kokkos::Impl::SpaceAccessibility< + execution_space, typename dest_type::memory_space>::accessible, "ScatterView deep_copy destination memory space not accessible"); auto extent = internal_view.extent(internal_view_type::rank - 1); bool is_equal = (dest.data() == internal_view.data()); size_t start = is_equal ? 
1 : 0; Kokkos::Impl::Experimental::ReduceDuplicates( - internal_view.data(), dest.data(), + exec_space, internal_view.data(), dest.data(), internal_view.stride(internal_view_type::rank - 1), start, extent, internal_view.label()); } - void reset() { + void reset(execution_space const& exec_space = execution_space()) { Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data(), internal_view.size(), internal_view.label()); + exec_space, internal_view.data(), internal_view.size(), + internal_view.label()); } + template void reset_except(View const& view) { + reset_except(execution_space(), view); + } + + template + void reset_except(execution_space const& exec_space, + View const& view) { if (view.data() != internal_view.data()) { - reset(); + reset(exec_space); return; } Kokkos::Impl::Experimental::ResetDuplicates( - internal_view.data() + view.size(), internal_view.size() - view.size(), - internal_view.label()); + exec_space, internal_view.data() + view.size(), + internal_view.size() - view.size(), internal_view.label()); } void resize(const size_t n0 = 0, const size_t n1 = 0, const size_t n2 = 0, @@ -1316,21 +1439,21 @@ template ::array_layout, typename ViewTraits::device_type, Op, - typename Kokkos::Impl::if_c< + std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, - Duplication>::type, - typename Kokkos::Impl::if_c< + Duplication>, + std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, - typename Kokkos::Impl::if_c< + typename std::conditional_t< std::is_same::value, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, - Duplication>::type>::type, - Contribution>::type> + Duplication>>::type, + Contribution>> create_scatter_view(View const& original_view) { return original_view; // implicit ScatterView constructor call } @@ -1365,12 
+1488,21 @@ create_scatter_view(Op, Duplication, Contribution, namespace Kokkos { namespace Experimental { +template +void contribute( + typename ES::execution_space const& exec_space, View& dest, + Kokkos::Experimental::ScatterView const& src) { + src.contribute_into(exec_space, dest); +} + template void contribute( View& dest, Kokkos::Experimental::ScatterView const& src) { - src.contribute_into(dest); + using execution_space = typename ES::execution_space; + contribute(execution_space{}, dest, src); } } // namespace Experimental diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index d2affda93a..edb0e7261d 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -264,26 +264,24 @@ class UnorderedMap { private: enum : size_type { invalid_index = ~static_cast(0) }; - using impl_value_type = - typename Impl::if_c::type; + using impl_value_type = std::conditional_t; - using key_type_view = typename Impl::if_c< + using key_type_view = std::conditional_t< is_insertable_map, View, - View > >::type; + View > >; - using value_type_view = - typename Impl::if_c, - View > >::type; + using value_type_view = std::conditional_t< + is_insertable_map || is_modifiable_map, + View, + View > >; - using size_type_view = typename Impl::if_c< + using size_type_view = std::conditional_t< is_insertable_map, View, - View > >::type; + View > >; using bitset_type = - typename Impl::if_c, - ConstBitset >::type; + std::conditional_t, + ConstBitset >; enum { modified_idx = 0, erasable_idx = 1, failed_insert_idx = 2 }; enum { num_scalars = 3 }; @@ -540,10 +538,7 @@ class UnorderedMap { // Previously claimed an unused entry that was not inserted. // Release this unused entry immediately. 
if (!m_available_indexes.reset(new_index)) { - // FIXME_SYCL SYCL doesn't allow printf in kernels -#ifndef KOKKOS_ENABLE_SYCL - printf("Unable to free existing\n"); -#endif + KOKKOS_IMPL_DO_NOT_USE_PRINTF("Unable to free existing\n"); } } @@ -659,8 +654,8 @@ class UnorderedMap { /// /// 'const value_type' via Cuda texture fetch must return by value. KOKKOS_FORCEINLINE_FUNCTION - typename Impl::if_c<(is_set || has_const_value), impl_value_type, - impl_value_type &>::type + std::conditional_t<(is_set || has_const_value), impl_value_type, + impl_value_type &> value_at(size_type i) const { return m_values[is_set ? 0 : (i < capacity() ? i : capacity())]; } diff --git a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp index 6e450598d1..6047e60f3d 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_Bitset_impl.hpp @@ -57,10 +57,22 @@ namespace Kokkos { namespace Impl { +KOKKOS_FORCEINLINE_FUNCTION +unsigned rotate_left(unsigned i, int r) { + constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); + return r ? ((i << r) | (i >> (size - r))) : i; +} + KOKKOS_FORCEINLINE_FUNCTION unsigned rotate_right(unsigned i, int r) { - enum { size = static_cast(sizeof(unsigned) * CHAR_BIT) }; + constexpr int size = static_cast(sizeof(unsigned) * CHAR_BIT); + // FIXME_SYCL llvm.fshr.i32 missing + // (https://github.com/intel/llvm/issues/3308) +#ifdef __SYCL_DEVICE_ONLY__ + return rotate_left(i, size - r); +#else return r ? 
((i >> r) | (i << (size - r))) : i; +#endif } template diff --git a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp index b06ab0846c..d7c4a5d1ff 100644 --- a/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp +++ b/lib/kokkos/containers/src/impl/Kokkos_UnorderedMap_impl.hpp @@ -250,8 +250,8 @@ struct UnorderedMapPrint { uint32_t list = m_map.m_hash_lists(i); for (size_type curr = list, ii = 0; curr != invalid_index; curr = m_map.m_next_index[curr], ++ii) { - printf("%d[%d]: %d->%d\n", list, ii, m_map.key_at(curr), - m_map.value_at(curr)); + KOKKOS_IMPL_DO_NOT_USE_PRINTF("%d[%d]: %d->%d\n", list, ii, + m_map.key_at(curr), m_map.value_at(curr)); } } }; diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index c84c5f6d5e..947d222c27 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -2,6 +2,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) # Because there is always an exception to the rule @@ -41,11 +42,6 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() - list(REMOVE_ITEM UnitTestSources - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_Bitset.cpp - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_ScatterView.cpp - ${CMAKE_CURRENT_BINARY_DIR}/sycl/TestSYCL_UnorderedMap.cpp - ) KOKKOS_ADD_EXECUTABLE_AND_TEST(UnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() diff --git a/lib/kokkos/containers/unit_tests/Makefile b/lib/kokkos/containers/unit_tests/Makefile index 
f42b9b7519..82669fe1ab 100644 --- a/lib/kokkos/containers/unit_tests/Makefile +++ b/lib/kokkos/containers/unit_tests/Makefile @@ -26,7 +26,7 @@ override LDFLAGS += -lpthread include $(KOKKOS_PATH)/Makefile.kokkos -KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests +KOKKOS_CXXFLAGS += -I$(GTEST_PATH) -I${KOKKOS_PATH}/containers/unit_tests -I${KOKKOS_PATH}/core/unit_test/category_files TEST_TARGETS = TARGETS = diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index 531caf0f85..3eee85ed10 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -114,6 +114,8 @@ struct test_dualview_combinations { a.template modify(); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); a.h_view(5, 1) = 3; a.h_view(6, 1) = 4; @@ -122,11 +124,15 @@ struct test_dualview_combinations { ViewType b = Kokkos::subview(a, std::pair(6, 9), std::pair(0, 1)); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); b.template modify(); Kokkos::deep_copy(b.d_view, 2); a.template sync(); + a.template sync( + Kokkos::DefaultExecutionSpace{}); Scalar count = 0; for (unsigned int i = 0; i < a.d_view.extent(0); i++) for (unsigned int j = 0; j < a.d_view.extent(1); j++) @@ -180,6 +186,7 @@ struct test_dual_view_deep_copy { } else { a.modify_device(); a.sync_host(); + a.sync_host(Kokkos::DefaultExecutionSpace{}); } // Check device view is initialized as expected @@ -208,6 +215,7 @@ struct test_dual_view_deep_copy { b.template sync(); } else { b.sync_host(); + b.sync_host(Kokkos::DefaultExecutionSpace{}); } // Perform same checks on b as done on a @@ -302,6 +310,7 @@ struct test_dualview_resize { ASSERT_EQ(a.extent(1), m / factor); a.sync_device(); + a.sync_device(Kokkos::DefaultExecutionSpace{}); // Check device view is initialized as expected a_d_sum = 0; @@ -404,19 +413,14 @@ void test_dualview_resize() { 
Impl::test_dualview_resize(); } -// FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combination) { test_dualview_combinations(10, true); } -#endif TEST(TEST_CATEGORY, dualview_alloc) { test_dualview_alloc(10); } -// FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dualview_combinations_without_init) { test_dualview_combinations(10, false); } @@ -433,8 +437,133 @@ TEST(TEST_CATEGORY, dualview_realloc) { TEST(TEST_CATEGORY, dualview_resize) { test_dualview_resize(); } + +namespace { +/** + * + * The following tests are a response to + * https://github.com/kokkos/kokkos/issues/3850 + * and + * https://github.com/kokkos/kokkos/pull/3857 + * + * DualViews were returning incorrect view types and taking + * inappropriate actions based on the templated view methods. + * + * Specifically, template view methods were always returning + * a device view if the memory space was UVM and a Kokkos::Device was passed. + * Sync/modify methods completely broke down So these tests exist to make sure + * that we keep the semantics of UVM DualViews intact. 
+ */ +// modify if we have other UVM enabled backends +#ifdef KOKKOS_ENABLE_CUDA // OR other UVM builds +#define UVM_ENABLED_BUILD #endif +#ifdef UVM_ENABLED_BUILD +template +struct UVMSpaceFor; +#endif + +#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA +template <> +struct UVMSpaceFor { + using type = Kokkos::CudaUVMSpace; +}; +#endif + +#ifdef UVM_ENABLED_BUILD +template <> +struct UVMSpaceFor { + using type = typename UVMSpaceFor::type; +}; +#else +template +struct UVMSpaceFor { + using type = typename ExecSpace::memory_space; +}; +#endif + +using ExecSpace = Kokkos::DefaultExecutionSpace; +using MemSpace = typename UVMSpaceFor::type; +using DeviceType = Kokkos::Device; + +using DualViewType = Kokkos::DualView; +using d_device = DeviceType; +using h_device = Kokkos::Device< + Kokkos::DefaultHostExecutionSpace, + typename UVMSpaceFor::type>; + +TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_d = dv.template view(); + using vdt = decltype(v_d); + using vdt_d = vdt::device_type; + using vdt_d_e = vdt_d::execution_space; + ASSERT_STREQ(vdt_d_e::name(), Kokkos::DefaultExecutionSpace::name()); +} +TEST(TEST_CATEGORY, dualview_host_correct_kokkos_device) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + auto v_h = dv.template view(); + using vht = decltype(v_h); + using vht_d = vht::device_type; + using vht_d_e = vht_d::execution_space; + ASSERT_STREQ(vht_d_e::name(), Kokkos::DefaultHostExecutionSpace::name()); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_host_modify_template_device_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_host(); + dv.template sync(); + 
EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, dualview_device_modify_template_host_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} +TEST(TEST_CATEGORY, dualview_device_modify_template_host_execspace_sync) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + dv.modify_device(); + dv.template sync(); + EXPECT_TRUE(!dv.need_sync_device()); + EXPECT_TRUE(!dv.need_sync_host()); + dv.clear_sync_state(); +} + +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_executionspace_views) { + DualViewType dv("myView", 100); + dv.clear_sync_state(); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); + ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), + dvt::device_type::execution_space::name()); + ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), + hvt::device_type::execution_space::name()); +} + +} // anonymous namespace } // namespace Test #endif // KOKKOS_TEST_DUALVIEW_HPP diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index 4b9f994417..f018793dd6 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -243,8 +243,6 @@ struct TestDynamicView { } }; -// FIXME_SYCL needs resize_serial -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, dynamic_view) { using TestDynView = TestDynamicView; @@ -252,7 +250,6 @@ TEST(TEST_CATEGORY, dynamic_view) { TestDynView::run(100000 + 100 * i); } } -#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestHIP_Category.hpp b/lib/kokkos/containers/unit_tests/TestHIP_Category.hpp deleted file mode 100644 index c2d60d1814..0000000000 --- a/lib/kokkos/containers/unit_tests/TestHIP_Category.hpp +++ 
/dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_HIP_HPP -#define KOKKOS_TEST_HIP_HPP - -#define TEST_CATEGORY hip -#define TEST_EXECSPACE Kokkos::Experimental::HIP - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestHPX_Category.hpp b/lib/kokkos/containers/unit_tests/TestHPX_Category.hpp deleted file mode 100644 index 64fc7c0757..0000000000 --- a/lib/kokkos/containers/unit_tests/TestHPX_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_HPX_HPP -#define KOKKOS_TEST_HPX_HPP - -#define TEST_CATEGORY hpx -#define TEST_EXECSPACE Kokkos::Experimental::HPX - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index 802813b13b..9ddc226e29 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -130,8 +130,6 @@ void test_offsetview_construction() { } } - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL const int ovmin0 = ov.begin(0); const int ovend0 = ov.end(0); const int ovmin1 = ov.begin(1); @@ -178,7 +176,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif #endif { @@ -215,8 +212,6 @@ void test_offsetview_construction() { point3_type{{extent0, extent1, extent2}}); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifdef KOKKOS_ENABLE_SYCL int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -239,7 +234,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif #endif } view_type viewFromOV = ov.view(); @@ 
-266,8 +260,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, ov); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -277,7 +269,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif #endif } @@ -288,8 +279,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(ov, aView); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -299,7 +288,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif #endif } } @@ -471,8 +459,6 @@ void test_offsetview_subview() { ASSERT_EQ(offsetSubview.end(1), 9); #if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) - // FIXME_SYCL requires MDRange policy -#ifndef KOKKOS_ENABLE_SYCL using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -498,7 +484,6 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif #endif } @@ -701,12 +686,9 @@ void test_offsetview_offsets_rank3() { } #endif -// FIXME_SYCL needs MDRangePolicy -#ifndef KOKKOS_ENABLE_SYCL TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); } -#endif TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); diff --git a/lib/kokkos/containers/unit_tests/TestOpenMP_Category.hpp b/lib/kokkos/containers/unit_tests/TestOpenMP_Category.hpp deleted file mode 100644 index a0169d1702..0000000000 --- a/lib/kokkos/containers/unit_tests/TestOpenMP_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// 
Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_OPENMP_HPP -#define KOKKOS_TEST_OPENMP_HPP - -#define TEST_CATEGORY openmp -#define TEST_EXECSPACE Kokkos::OpenMP - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp b/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp deleted file mode 100644 index 51fd3fc911..0000000000 --- a/lib/kokkos/containers/unit_tests/TestSYCL_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_SYCL_HPP -#define KOKKOS_TEST_SYCL_HPP - -#define TEST_CATEGORY sycl -#define TEST_EXECSPACE Kokkos::Experimental::SYCL - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index 3a3cb607a6..fdbce2d492 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -437,6 +437,10 @@ struct test_scatter_view_config { Contribution, Op, NumberType>::orig_view_type; + void compile_constructor() { + auto sv = scatter_view_def(Kokkos::view_alloc(DeviceType{}, "label"), 10); + } + void run_test(int n) { // test allocation { diff --git a/lib/kokkos/containers/unit_tests/TestSerial_Category.hpp b/lib/kokkos/containers/unit_tests/TestSerial_Category.hpp deleted file mode 100644 index 2aa09a315a..0000000000 --- a/lib/kokkos/containers/unit_tests/TestSerial_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. 
-// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_SERIAL_HPP -#define KOKKOS_TEST_SERIAL_HPP - -#define TEST_CATEGORY serial -#define TEST_EXECSPACE Kokkos::Serial - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp index 8bb267ce5d..a9a178f95e 100644 --- a/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp +++ b/lib/kokkos/containers/unit_tests/TestStaticCrsGraph.hpp @@ -285,10 +285,7 @@ void run_test_graph4() { TEST(TEST_CATEGORY, staticcrsgraph) { TestStaticCrsGraph::run_test_graph(); - // FIXME_SYCL requires MDRangePolicy -#ifndef KOKKOS_ENABLE_SYCL TestStaticCrsGraph::run_test_graph2(); -#endif TestStaticCrsGraph::run_test_graph3(1, 0); TestStaticCrsGraph::run_test_graph3(1, 1000); TestStaticCrsGraph::run_test_graph3(1, 10000); diff --git a/lib/kokkos/containers/unit_tests/TestThreads_Category.hpp b/lib/kokkos/containers/unit_tests/TestThreads_Category.hpp deleted file mode 100644 index 74a2b0da36..0000000000 --- a/lib/kokkos/containers/unit_tests/TestThreads_Category.hpp +++ /dev/null @@ -1,51 +0,0 @@ -/* -//@HEADER -// ************************************************************************ -// -// Kokkos v. 3.0 -// Copyright (2020) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Redistribution and use in source and binary forms, with or without -// modification, are permitted provided that the following conditions are -// met: -// -// 1. Redistributions of source code must retain the above copyright -// notice, this list of conditions and the following disclaimer. -// -// 2. 
Redistributions in binary form must reproduce the above copyright -// notice, this list of conditions and the following disclaimer in the -// documentation and/or other materials provided with the distribution. -// -// 3. Neither the name of the Corporation nor the names of the -// contributors may be used to endorse or promote products derived from -// this software without specific prior written permission. -// -// THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY -// EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR -// PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE -// CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, -// EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, -// PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR -// PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF -// LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING -// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS -// SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -// -// Questions? Contact Christian R. 
Trott (crtrott@sandia.gov) -// -// ************************************************************************ -//@HEADER -*/ - -#ifndef KOKKOS_TEST_THREADS_HPP -#define KOKKOS_TEST_THREADS_HPP - -#define TEST_CATEGORY threads -#define TEST_EXECSPACE Kokkos::Threads - -#endif diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index d39e0061c7..4413cfbc80 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -163,7 +163,8 @@ struct TestFind { KOKKOS_INLINE_FUNCTION void operator()(typename execution_space::size_type i, value_type &errors) const { - const bool expect_to_find_i = (i < m_max_key); + const bool expect_to_find_i = + (i < typename execution_space::size_type(m_max_key)); const bool exists = m_map.exists(i); @@ -293,10 +294,11 @@ void test_deep_copy(uint32_t num_nodes) { } } -// FIXME_HIP wrong result in CI but works locally -#ifndef KOKKOS_ENABLE_HIP +// FIXME_SYCL wrong results on Nvidia GPUs but correct on Host and Intel GPUs +// FIXME_HIP // WORKAROUND MSVC -#ifndef _WIN32 +#if !(defined(KOKKOS_ENABLE_HIP) && (HIP_VERSION < 401)) && \ + !defined(_WIN32) && !defined(KOKKOS_ENABLE_SYCL) TEST(TEST_CATEGORY, UnorderedMap_insert) { for (int i = 0; i < 500; ++i) { test_insert(100000, 90000, 100, true); @@ -304,7 +306,6 @@ TEST(TEST_CATEGORY, UnorderedMap_insert) { } } #endif -#endif TEST(TEST_CATEGORY, UnorderedMap_failed_insert) { for (int i = 0; i < 1000; ++i) test_failed_insert(10000); diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index b7b817c910..9ff4b6006d 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -9,6 +9,14 @@ # that in TriBITS KokkosAlgorithms can be disabled... 
#INCLUDE_DIRECTORIES("${CMAKE_CURRENT_SOURCE_DIR}/../../algorithms/src") +# FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. +IF (KOKKOS_ENABLE_OPENMPTARGET + AND (KOKKOS_CXX_COMPILER_ID STREQUAL PGI + OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC)) + RETURN() +ENDIF() + + SET(SOURCES PerfTestMain.cpp PerfTestGramSchmidt.cpp @@ -68,8 +76,7 @@ KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) # This test currently times out for MSVC -# FIXME_SYCL these tests don't compile yet (require parallel_for). -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) +IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") KOKKOS_ADD_EXECUTABLE_AND_TEST( PerfTestExec SOURCES ${SOURCES} @@ -77,13 +84,11 @@ IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC" AND NOT Kokkos_ENABLE_SYCL) ) ENDIF() -# FIXME_SYCL -IF(NOT Kokkos_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_Atomic - SOURCES test_atomic.cpp - CATEGORIES PERFORMANCE - ) +KOKKOS_ADD_EXECUTABLE_AND_TEST( + PerformanceTest_Atomic + SOURCES test_atomic.cpp + CATEGORIES PERFORMANCE +) IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) KOKKOS_ADD_EXECUTABLE_AND_TEST( @@ -98,7 +103,6 @@ KOKKOS_ADD_EXECUTABLE_AND_TEST( SOURCES test_mempool.cpp CATEGORIES PERFORMANCE ) -ENDIF() IF(NOT Kokkos_ENABLE_OPENMPTARGET) # FIXME OPENMPTARGET needs tasking diff --git a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp index 70186283c1..dee21fd7a5 100644 --- a/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp +++ b/lib/kokkos/core/perf_test/PerfTestGramSchmidt.cpp @@ -69,7 +69,7 @@ struct InvNorm2 : public Kokkos::DotSingle { KOKKOS_INLINE_FUNCTION void final(value_type& result) const { - result = std::sqrt(result); + result = Kokkos::Experimental::sqrt(result); Rjj() = result; inv() = (0 < result) ? 
1.0 / result : 0; } @@ -145,7 +145,7 @@ struct ModifiedGramSchmidt { // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ; Kokkos::scale(tmp, Qj); - for (size_t k = j + 1; k < count; ++k) { + for (size_type k = j + 1; k < count; ++k) { const vector_type Qk = Kokkos::subview(Q_, Kokkos::ALL(), k); const value_view Rjk = Kokkos::subview(R_, j, k); @@ -165,7 +165,7 @@ struct ModifiedGramSchmidt { //-------------------------------------------------------------------------- - static double test(const size_t length, const size_t count, + static double test(const size_type length, const size_type count, const size_t iter = 1) { multivector_type Q_("Q", length, count); multivector_type R_("R", count, count); diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index e0590a78a4..2ab0989805 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -72,8 +72,6 @@ KOKKOS_ADD_LIBRARY( ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -SET_TARGET_PROPERTIES(kokkoscore PROPERTIES VERSION ${Kokkos_VERSION}) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} @@ -87,3 +85,4 @@ KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBRT) KOKKOS_LINK_TPL(kokkoscore PUBLIC PTHREAD) +KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 4a30c914f0..916f109758 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -45,6 +45,10 @@ #include #ifdef KOKKOS_ENABLE_CUDA +#include +#include +#include + #include #include #include @@ -52,10 +56,6 @@ #include #include -#include -#include -#include - //#include #include #include @@ -65,6 +65,22 @@ /*--------------------------------------------------------------------------*/ 
/*--------------------------------------------------------------------------*/ +cudaStream_t Kokkos::Impl::cuda_get_deep_copy_stream() { + static cudaStream_t s = nullptr; + if (s == nullptr) { + cudaStreamCreate(&s); + } + return s; +} + +const std::unique_ptr &Kokkos::Impl::cuda_get_deep_copy_space( + bool initialize) { + static std::unique_ptr space = nullptr; + if (!space && initialize) + space = std::make_unique(Kokkos::Impl::cuda_get_deep_copy_stream()); + return space; +} + namespace Kokkos { namespace Impl { @@ -72,13 +88,6 @@ namespace { static std::atomic num_uvm_allocations(0); -cudaStream_t get_deep_copy_stream() { - static cudaStream_t s = nullptr; - if (s == nullptr) { - cudaStreamCreate(&s); - } - return s; -} } // namespace DeepCopy::DeepCopy(void *dst, const void *src, @@ -115,7 +124,7 @@ DeepCopy::DeepCopy(const Cuda &instance, void *dst, } void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { - cudaStream_t s = get_deep_copy_stream(); + cudaStream_t s = cuda_get_deep_copy_stream(); CUDA_SAFE_CALL(cudaMemcpyAsync(dst, src, n, cudaMemcpyDefault, s)); cudaStreamSynchronize(s); } @@ -128,14 +137,14 @@ void DeepCopyAsyncCuda(void *dst, const void *src, size_t n) { namespace Kokkos { -void CudaSpace::access_error() { +KOKKOS_DEPRECATED void CudaSpace::access_error() { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " "non-Cuda space"); Kokkos::Impl::throw_runtime_exception(msg); } -void CudaSpace::access_error(const void *const) { +KOKKOS_DEPRECATED void CudaSpace::access_error(const void *const) { const std::string msg( "Kokkos::CudaSpace::access_error attempt to execute Cuda function from " "non-Cuda space"); @@ -459,79 +468,6 @@ SharedAllocationRecord::attach_texture_object( return tex_obj; } -//============================================================================== -// {{{1 - -std::string SharedAllocationRecord::get_label() const { - SharedAllocationHeader header; - - 
Kokkos::Impl::DeepCopy( - &header, RecordBase::head(), sizeof(SharedAllocationHeader)); - - return std::string(header.m_label); -} - -std::string SharedAllocationRecord::get_label() - const { - return std::string(RecordBase::head()->m_label); -} - -std::string -SharedAllocationRecord::get_label() const { - return std::string(RecordBase::head()->m_label); -} - -// end SharedAllocationRecord::get_label() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_label, - const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord - *SharedAllocationRecord::allocate( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -// end SharedAllocationRecord allocate() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord *arg_rec) { - delete static_cast(arg_rec); -} - -// end SharedAllocationRecord deallocate }}}1 
-//============================================================================== - //============================================================================== // {{{1 @@ -580,7 +516,7 @@ SharedAllocationRecord::SharedAllocationRecord( const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -592,13 +528,7 @@ SharedAllocationRecord::SharedAllocationRecord( SharedAllocationHeader header; - // Fill in the Header information - header.m_record = static_cast *>(this); - - strncpy(header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(header, arg_label); // Copy to device memory Kokkos::Impl::DeepCopy(RecordBase::m_alloc_ptr, &header, @@ -611,7 +541,7 @@ SharedAllocationRecord::SharedAllocationRecord( const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, #endif @@ -620,16 +550,8 @@ SharedAllocationRecord::SharedAllocationRecord( sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_tex_obj(0), m_space(arg_space) { - // Fill in the Header information, directly accessible via UVM - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + 
this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); } SharedAllocationRecord:: @@ -639,7 +561,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, @@ -648,319 +570,13 @@ SharedAllocationRecord:: arg_alloc_size), sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_space(arg_space) { - // Fill in the Header information, directly accessible on the host - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(*base_t::m_alloc_ptr, + arg_label); } // end SharedAllocationRecord constructors }}}1 //============================================================================== -//============================================================================== -// {{{1 - -void *SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaSpace &arg_space, const std::string &arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked( - void *const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = 
get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void *SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaUVMSpace &arg_space, const std::string &arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked( - void *const arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void *SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void * -SharedAllocationRecord::allocate_tracked( - const Kokkos::CudaHostPinnedSpace &arg_space, - const std::string &arg_alloc_label, const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord *const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void *const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord *const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void * 
-SharedAllocationRecord::reallocate_tracked( - void *const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord *const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord *const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -// end SharedAllocationRecored::(re|de|)allocate_tracked }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -SharedAllocationRecord * -SharedAllocationRecord::get_record(void *alloc_ptr) { - using RecordCuda = SharedAllocationRecord; - - using Header = SharedAllocationHeader; - - // Copy the header from the allocation - Header head; - - Header const *const head_cuda = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - - if (alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, head_cuda, sizeof(SharedAllocationHeader)); - } - - RecordCuda *const record = - alloc_ptr ? static_cast(head.m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head_cuda) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< Kokkos::CudaSpace , " - "void >::get_record ERROR")); - } - - return record; -} - -SharedAllocationRecord *SharedAllocationRecord< - Kokkos::CudaUVMSpace, void>::get_record(void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordCuda = SharedAllocationRecord; - - Header *const h = - alloc_ptr ? reinterpret_cast
(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::CudaUVMSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -SharedAllocationRecord - *SharedAllocationRecord::get_record( - void *alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordCuda = SharedAllocationRecord; - - Header *const h = - alloc_ptr ? reinterpret_cast
(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception( - std::string("Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::CudaHostPinnedSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -// end SharedAllocationRecord::get_record() }}}1 -//============================================================================== - -//============================================================================== -// {{{1 - -// Iterate records to print orphaned memory ... -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord *r = &s_root_record; - - char buffer[256]; - - SharedAllocationHeader head; - - if (detail) { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - } else { - head.m_label[0] = 0; - } - - // Formatting dependent on sizeof(uintptr_t) - const char *format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = - "Cuda addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx " - "+ %.8ld ] count(%d) dealloc(0x%.12lx) %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = - "Cuda addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ " - "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"; - } - - snprintf(buffer, 256, format_string, reinterpret_cast(r), - reinterpret_cast(r->m_prev), - reinterpret_cast(r->m_next), - reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, - r->m_count, reinterpret_cast(r->m_dealloc), - head.m_label); - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } else { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - - // Formatting dependent on sizeof(uintptr_t) - const char *format_string; - - if 
(sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = "Cuda [ 0x%.12lx + %ld ] %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = "Cuda [ 0x%.12llx + %ld ] %s\n"; - } - - snprintf(buffer, 256, format_string, - reinterpret_cast(r->data()), r->size(), - head.m_label); - } else { - snprintf(buffer, 256, "Cuda [ 0 + 0 ]\n"); - } - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaUVMSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "CudaUVM", &s_root_record, detail); -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -void SharedAllocationRecord::print_records( - std::ostream &s, const Kokkos::CudaHostPinnedSpace &, bool detail) { - (void)s; - (void)detail; -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord::print_host_accessible_records( - s, "CudaHostPinned", &s_root_record, detail); -#else - Kokkos::Impl::throw_runtime_exception( - "SharedAllocationHeader::print_records only works with " - "KOKKOS_ENABLE_DEBUG enabled"); -#endif -} - -// end SharedAllocationRecord::print_records() }}}1 -//============================================================================== - void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, bool to_device) { if ((ptr == nullptr) || (bytes == 0)) return; @@ -984,6 +600,29 @@ void cuda_prefetch_pointer(const Cuda &space, const void *ptr, size_t bytes, } // namespace Impl } // namespace Kokkos + +//============================================================================== +// {{{1 + +#include + +namespace Kokkos { 
+namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. +template class SharedAllocationRecordCommon; +template class HostInaccessibleSharedAllocationRecordCommon; +template class SharedAllocationRecordCommon; +template class SharedAllocationRecordCommon; + +} // end namespace Impl +} // end namespace Kokkos + +// end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== + #else void KOKKOS_CORE_SRC_CUDA_CUDASPACE_PREVENT_LINK_ERROR() {} #endif // KOKKOS_ENABLE_CUDA diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp index 0d6d3bdb3a..0f4259072d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_BlockSize_Deduction.hpp @@ -140,7 +140,7 @@ inline int cuda_deduce_block_size(bool early_termination, } } - if (early_termination && blocks_per_sm != 0) break; + if (early_termination && opt_block_size != 0) break; } return opt_block_size; @@ -222,7 +222,8 @@ inline size_t get_shmem_per_sm_prefer_l1(cudaDeviceProp const& properties) { case 52: case 61: return 96; case 70: - case 80: return 8; + case 80: + case 86: return 8; case 75: return 32; default: Kokkos::Impl::throw_runtime_exception( diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp index a9a62380e5..ec9c434fe6 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Half.hpp @@ -175,30 +175,42 @@ class half_t { return cast_from_half(*this); } + /** + * Conversion constructors. 
+ * + * Support implicit conversions from impl_type, float, double -> half_t + * Mixed precision expressions require upcasting which is done in the + * "// Binary Arithmetic" operator overloads below. + * + * Support implicit conversions from integral types -> half_t. + * Expressions involving half_t with integral types require downcasting + * the integral types to half_t. Existing operator overloads can handle this + * with the addition of the below implicit conversion constructors. + */ KOKKOS_FUNCTION half_t(impl_type rhs) : val(rhs) {} KOKKOS_FUNCTION - explicit half_t(float rhs) : val(cast_to_half(rhs).val) {} + half_t(float rhs) : val(cast_to_half(rhs).val) {} + KOKKOS_FUNCTION + half_t(double rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION explicit half_t(bool rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(double rhs) : val(cast_to_half(rhs).val) {} + half_t(short rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(short rhs) : val(cast_to_half(rhs).val) {} + half_t(int rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(int rhs) : val(cast_to_half(rhs).val) {} + half_t(long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(long rhs) : val(cast_to_half(rhs).val) {} + half_t(long long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(long long rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned short rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned int rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} KOKKOS_FUNCTION - explicit half_t(unsigned long rhs) : val(cast_to_half(rhs).val) {} - KOKKOS_FUNCTION - explicit half_t(unsigned long long rhs) : val(cast_to_half(rhs).val) {} + half_t(unsigned long long rhs) : 
val(cast_to_half(rhs).val) {} // Unary operators KOKKOS_FUNCTION @@ -243,7 +255,7 @@ class half_t { #else float tmp = __half2float(val); --tmp; - val = __float2half(tmp); + val = __float2half(tmp); #endif return *this; } @@ -276,88 +288,317 @@ class half_t { return *this; } + template + KOKKOS_FUNCTION void operator=(T rhs) volatile { + val = cast_to_half(rhs).val; + } + // Compound operators KOKKOS_FUNCTION half_t& operator+=(half_t rhs) { #ifdef __CUDA_ARCH__ val += rhs.val; #else - val = __float2half(__half2float(val) + __half2float(rhs.val)); + val = __float2half(__half2float(val) + __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator+=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) + rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) + + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for += + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+=(T& lhs, half_t rhs) { + lhs += static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator+=(float rhs) { + float result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator+=(double rhs) { + double result = static_cast(val) + rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator-=(half_t rhs) { #ifdef __CUDA_ARCH__ val -= rhs.val; #else - val = __float2half(__half2float(val) - __half2float(rhs.val)); + val = __float2half(__half2float(val) - __half2float(rhs.val)); #endif return 
*this; } + KOKKOS_FUNCTION + volatile half_t& operator-=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. + val = const_cast(val) - rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) - + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for -= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-=(T& lhs, half_t rhs) { + lhs -= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator-=(float rhs) { + float result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator-=(double rhs) { + double result = static_cast(val) - rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator*=(half_t rhs) { #ifdef __CUDA_ARCH__ val *= rhs.val; #else - val = __float2half(__half2float(val) * __half2float(rhs.val)); + val = __float2half(__half2float(val) * __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator*=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. 
+ val = const_cast(val) * rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) * + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for *= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*=(T& lhs, half_t rhs) { + lhs *= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator*=(float rhs) { + float result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator*=(double rhs) { + double result = static_cast(val) * rhs; + val = static_cast(result); + return *this; + } + KOKKOS_FUNCTION half_t& operator/=(half_t rhs) { #ifdef __CUDA_ARCH__ val /= rhs.val; #else - val = __float2half(__half2float(val) / __half2float(rhs.val)); + val = __float2half(__half2float(val) / __half2float(rhs.val)); #endif return *this; } + KOKKOS_FUNCTION + volatile half_t& operator/=(half_t rhs) volatile { +#ifdef __CUDA_ARCH__ + // Cuda 10 supports __half volatile stores but not volatile arithmetic + // operands. Cast away volatile-ness of val for arithmetic but not for store + // location. 
+ val = const_cast(val) / rhs.val; +#else + // Use non-volatile val_ref to suppress: + // "warning: implicit dereference will not access object of type ‘volatile + // __half’ in statement" + auto val_ref = const_cast(val); + val_ref = __float2half(__half2float(const_cast(val)) / + __half2float(rhs.val)); +#endif + return *this; + } + + // Compund operators: upcast overloads for /= + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/=(T& lhs, half_t rhs) { + lhs /= static_cast(rhs); + return lhs; + } + + KOKKOS_FUNCTION + half_t& operator/=(float rhs) { + float result = static_cast(val) / rhs; + val = static_cast(result); + return *this; + } + + KOKKOS_FUNCTION + half_t& operator/=(double rhs) { + double result = static_cast(val) / rhs; + val = static_cast(result); + return *this; + } + // Binary Arithmetic KOKKOS_FUNCTION half_t friend operator+(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val += rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) + __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+(half_t lhs, T rhs) { + return T(lhs) + rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator+(T lhs, half_t rhs) { + return lhs + T(rhs); + } + KOKKOS_FUNCTION half_t friend operator-(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val -= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) - __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for - + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-(half_t lhs, T rhs) { + 
return T(lhs) - rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator-(T lhs, half_t rhs) { + return lhs - T(rhs); + } + KOKKOS_FUNCTION half_t friend operator*(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val *= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) * __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for * + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*(half_t lhs, T rhs) { + return T(lhs) * rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator*(T lhs, half_t rhs) { + return lhs * T(rhs); + } + KOKKOS_FUNCTION half_t friend operator/(half_t lhs, half_t rhs) { #ifdef __CUDA_ARCH__ lhs.val /= rhs.val; #else - lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); + lhs.val = __float2half(__half2float(lhs.val) / __half2float(rhs.val)); #endif return lhs; } + // Binary Arithmetic upcast operators for / + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/(half_t lhs, T rhs) { + return T(lhs) / rhs; + } + + template + KOKKOS_FUNCTION std::enable_if_t< + std::is_same::value || std::is_same::value, T> friend + operator/(T lhs, half_t rhs) { + return lhs / T(rhs); + } + // Logical operators KOKKOS_FUNCTION bool operator!() const { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index b8e8163458..016cb6cdcb 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -248,11 +249,11 @@ void CudaInternal::print_configuration(std::ostream &s) const { const 
CudaInternalDevices &dev_info = CudaInternalDevices::singleton(); #if defined(KOKKOS_ENABLE_CUDA) - s << "macro KOKKOS_ENABLE_CUDA : defined" << std::endl; + s << "macro KOKKOS_ENABLE_CUDA : defined\n"; #endif #if defined(CUDA_VERSION) s << "macro CUDA_VERSION = " << CUDA_VERSION << " = version " - << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << std::endl; + << CUDA_VERSION / 1000 << "." << (CUDA_VERSION % 1000) / 10 << '\n'; #endif for (int i = 0; i < dev_info.m_cudaDevCount; ++i) { @@ -274,7 +275,6 @@ CudaInternal::~CudaInternal() { m_scratchConcurrentBitset) { std::cerr << "Kokkos::Cuda ERROR: Failed to call Kokkos::Cuda::finalize()" << std::endl; - std::cerr.flush(); } m_cudaDev = -1; @@ -358,8 +358,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { if (m_cudaArch == 0) { std::stringstream ss; - ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture" - << std::endl; + ss << "Kokkos::Cuda::initialize ERROR: likely mismatch of architecture\n"; std::string msg = ss.str(); Kokkos::abort(msg.c_str()); } @@ -373,7 +372,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { "compute capability " << compiled_major << "." << compiled_minor << " on device with compute capability " << cudaProp.major << "." - << cudaProp.minor << " is not supported by CUDA!" 
<< std::endl; + << cudaProp.minor << " is not supported by CUDA!\n"; std::string msg = ss.str(); Kokkos::abort(msg.c_str()); } @@ -458,7 +457,7 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { Kokkos::Impl::SharedAllocationRecord; Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchBitset", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchBitset", sizeof(uint32_t) * buffer_bound); Record::increment(r); @@ -492,17 +491,11 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { #ifdef KOKKOS_ENABLE_CUDA_UVM if (Kokkos::show_warnings() && !cuda_launch_blocking()) { - std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into " - "UVMSpace by default" - << std::endl; - std::cerr << " without setting " - "CUDA_LAUNCH_BLOCKING=1." - << std::endl; - std::cerr << " The code must call " - "Cuda().fence() after each kernel" - << std::endl; - std::cerr << " or will likely crash when " - "accessing data on the host." + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_LAUNCH_BLOCKING=1. + The code must call Cuda().fence() after each kernel + or will likely crash when accessing data on the host.)warning" << std::endl; } @@ -520,19 +513,13 @@ void CudaInternal::initialize(int cuda_device_id, cudaStream_t stream) { if (Kokkos::show_warnings() && (!visible_devices_one && !force_device_alloc)) { - std::cerr << "Kokkos::Cuda::initialize WARNING: Cuda is allocating into " - "UVMSpace by default" + std::cerr << R"warning( +Kokkos::Cuda::initialize WARNING: Cuda is allocating into UVMSpace by default + without setting CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or + setting CUDA_VISIBLE_DEVICES. 
+ This could on multi GPU systems lead to severe performance" + penalties.)warning" << std::endl; - std::cerr << " without setting " - "CUDA_MANAGED_FORCE_DEVICE_ALLOC=1 or " - << std::endl; - std::cerr - << " setting CUDA_VISIBLE_DEVICES." - << std::endl; - std::cerr << " This could on multi GPU " - "systems lead to severe performance" - << std::endl; - std::cerr << " penalties." << std::endl; } #endif @@ -575,7 +562,7 @@ Cuda::size_type *CudaInternal::scratch_flags(const Cuda::size_type size) const { if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchFlags", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFlags", (sizeof(ScratchGrain) * m_scratchFlagsCount)); Record::increment(r); @@ -600,7 +587,7 @@ Cuda::size_type *CudaInternal::scratch_space(const Cuda::size_type size) const { if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); Record *const r = - Record::allocate(Kokkos::CudaSpace(), "InternalScratchSpace", + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchSpace", (sizeof(ScratchGrain) * m_scratchSpaceCount)); Record::increment(r); @@ -624,7 +611,7 @@ Cuda::size_type *CudaInternal::scratch_unified( Record::decrement(Record::get_record(m_scratchUnified)); Record *const r = Record::allocate( - Kokkos::CudaHostPinnedSpace(), "InternalScratchUnified", + Kokkos::CudaHostPinnedSpace(), "Kokkos::InternalScratchUnified", (sizeof(ScratchGrain) * m_scratchUnifiedCount)); Record::increment(r); @@ -646,8 +633,9 @@ Cuda::size_type *CudaInternal::scratch_functor( if (m_scratchFunctor) Record::decrement(Record::get_record(m_scratchFunctor)); - Record *const r = Record::allocate( - Kokkos::CudaSpace(), "InternalScratchFunctor", m_scratchFunctorSize); + Record *const r = + Record::allocate(Kokkos::CudaSpace(), "Kokkos::InternalScratchFunctor", + m_scratchFunctorSize); Record::increment(r); @@ -662,7 +650,7 @@ void 
*CudaInternal::resize_team_scratch_space(std::int64_t bytes, if (m_team_scratch_current_size == 0) { m_team_scratch_current_size = bytes; m_team_scratch_ptr = Kokkos::kokkos_malloc( - "CudaSpace::ScratchMemory", m_team_scratch_current_size); + "Kokkos::CudaSpace::TeamScratchMemory", m_team_scratch_current_size); } if ((bytes > m_team_scratch_current_size) || ((bytes < m_team_scratch_current_size) && (force_shrink))) { @@ -676,6 +664,9 @@ void *CudaInternal::resize_team_scratch_space(std::int64_t bytes, //---------------------------------------------------------------------------- void CudaInternal::finalize() { + // skip if finalize() has already been called + if (was_finalized) return; + was_finalized = true; if (nullptr != m_scratchSpace || nullptr != m_scratchFlags) { // Only finalize this if we're the singleton @@ -719,6 +710,11 @@ void CudaInternal::finalize() { if (this == &singleton()) { cudaFreeHost(constantMemHostStaging); cudaEventDestroy(constantMemReusable); + auto &deep_copy_space = + Kokkos::Impl::cuda_get_deep_copy_space(/*initialize*/ false); + if (deep_copy_space) + deep_copy_space->impl_internal_space_instance()->finalize(); + cudaStreamDestroy(cuda_get_deep_copy_stream()); } } @@ -821,62 +817,23 @@ Cuda::size_type Cuda::device_arch() { void Cuda::impl_finalize() { Impl::CudaInternal::singleton().finalize(); } Cuda::Cuda() - : m_space_instance(&Impl::CudaInternal::singleton()), m_counter(nullptr) { + : m_space_instance(&Impl::CudaInternal::singleton(), + [](Impl::CudaInternal *) {}) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); } Cuda::Cuda(cudaStream_t stream) - : m_space_instance(new Impl::CudaInternal), m_counter(new int(1)) { + : m_space_instance(new Impl::CudaInternal, [](Impl::CudaInternal *ptr) { + ptr->finalize(); + delete ptr; + }) { Impl::CudaInternal::singleton().verify_is_initialized( "Cuda instance constructor"); m_space_instance->initialize(Impl::CudaInternal::singleton().m_cudaDev, stream); } 
-KOKKOS_FUNCTION Cuda::Cuda(Cuda &&other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; -} - -KOKKOS_FUNCTION Cuda::Cuda(const Cuda &other) - : m_space_instance(other.m_space_instance), m_counter(other.m_counter) { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif -} - -KOKKOS_FUNCTION Cuda &Cuda::operator=(Cuda &&other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; - return *this; -} - -KOKKOS_FUNCTION Cuda &Cuda::operator=(const Cuda &other) { - m_space_instance = other.m_space_instance; - m_counter = other.m_counter; -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif - return *this; -} - -KOKKOS_FUNCTION Cuda::~Cuda() noexcept { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA - if (m_counter == nullptr) return; - int const count = Kokkos::atomic_fetch_sub(m_counter, 1); - if (count == 1) { - delete m_counter; - m_space_instance->finalize(); - delete m_space_instance; - } -#endif -} - void Cuda::print_configuration(std::ostream &s, const bool) { Impl::CudaInternal::singleton().print_configuration(s); } @@ -924,54 +881,53 @@ void CudaSpaceInitializer::fence() { Kokkos::Cuda::impl_static_fence(); } void CudaSpaceInitializer::print_configuration(std::ostream &msg, const bool detail) { - msg << "Device Execution Space:" << std::endl; - msg << " KOKKOS_ENABLE_CUDA: "; - msg << "yes" << std::endl; + msg << "Device Execution Space:\n"; + msg << " KOKKOS_ENABLE_CUDA: yes\n"; - msg << "Cuda Atomics:" << std::endl; + msg << "Cuda Atomics:\n"; msg << " KOKKOS_ENABLE_CUDA_ATOMICS: "; #ifdef KOKKOS_ENABLE_CUDA_ATOMICS - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif - msg << "Cuda Options:" << 
std::endl; + msg << "Cuda Options:\n"; msg << " KOKKOS_ENABLE_CUDA_LAMBDA: "; #ifdef KOKKOS_ENABLE_CUDA_LAMBDA - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; #ifdef KOKKOS_ENABLE_CUDA_LDG_INTRINSIC - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUDA_UVM: "; #ifdef KOKKOS_ENABLE_CUDA_UVM - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CUSPARSE: "; #ifdef KOKKOS_ENABLE_CUSPARSE - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; #ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - msg << "yes" << std::endl; + msg << "yes\n"; #else - msg << "no" << std::endl; + msg << "no\n"; #endif msg << "\nCuda Runtime Configuration:" << std::endl; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp index 13773d70c5..aaec2c2926 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.hpp @@ -17,30 +17,24 @@ namespace Kokkos { namespace Impl { struct CudaTraits { - enum : CudaSpace::size_type { WarpSize = 32 /* 0x0020 */ }; - enum : CudaSpace::size_type { - WarpIndexMask = 0x001f /* Mask for warpindex */ - }; - enum : CudaSpace::size_type { - WarpIndexShift = 5 /* WarpSize == 1 << WarpShift */ - }; + static constexpr CudaSpace::size_type WarpSize = 32 /* 0x0020 */; + static constexpr CudaSpace::size_type WarpIndexMask = + 0x001f; /* Mask for warpindex */ + static constexpr CudaSpace::size_type 
WarpIndexShift = + 5; /* WarpSize == 1 << WarpShift */ - enum : CudaSpace::size_type { - ConstantMemoryUsage = 0x008000 /* 32k bytes */ - }; - enum : CudaSpace::size_type { - ConstantMemoryCache = 0x002000 /* 8k bytes */ - }; - enum : CudaSpace::size_type { - KernelArgumentLimit = 0x001000 /* 4k bytes */ - }; - enum : CudaSpace::size_type { - MaxHierarchicalParallelism = 1024 /* team_size * vector_length */ - }; + static constexpr CudaSpace::size_type ConstantMemoryUsage = + 0x008000; /* 32k bytes */ + static constexpr CudaSpace::size_type ConstantMemoryCache = + 0x002000; /* 8k bytes */ + static constexpr CudaSpace::size_type KernelArgumentLimit = + 0x001000; /* 4k bytes */ + static constexpr CudaSpace::size_type MaxHierarchicalParallelism = + 1024; /* team_size * vector_length */ using ConstantGlobalBufferType = unsigned long[ConstantMemoryUsage / sizeof(unsigned long)]; - enum { ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */ }; + static constexpr int ConstantMemoryUseThreshold = 0x000200 /* 512 bytes */; KOKKOS_INLINE_FUNCTION static CudaSpace::size_type warp_count( CudaSpace::size_type i) { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index 39404e0bf3..d892a893b3 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -158,6 +158,9 @@ inline void check_shmem_request(CudaInternal const* cuda_instance, int shmem) { } } +// This function needs to be template on DriverType and LaunchBounds +// so that the static bool is unique for each type combo +// KernelFuncPtr does not necessarily contain that type information. 
template inline void configure_shmem_preference(KernelFuncPtr const& func, bool prefer_shmem) { @@ -355,8 +358,7 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( + Impl::configure_shmem_preference( base_t::get_kernel_func(), prefer_shmem); void const* args[] = {&driver}; @@ -449,8 +451,7 @@ struct CudaParallelLaunchKernelInvoker< if (!Impl::is_empty_launch(grid, block)) { Impl::check_shmem_request(cuda_instance, shmem); - Impl::configure_shmem_preference( + Impl::configure_shmem_preference( base_t::get_kernel_func(), prefer_shmem); auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); @@ -627,9 +628,8 @@ struct CudaParallelLaunchImpl< get_cuda_func_attributes(), block, shmem, prefer_shmem); Impl::configure_shmem_preference< - DriverType, Kokkos::LaunchBounds, - decltype(base_t::get_kernel_func())>(base_t::get_kernel_func(), - prefer_shmem); + DriverType, Kokkos::LaunchBounds>( + base_t::get_kernel_func(), prefer_shmem); KOKKOS_ENSURE_CUDA_LOCK_ARRAYS_ON_DEVICE(); diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp new file mode 100644 index 0000000000..12b7f70a97 --- /dev/null +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_MDRangePolicy.hpp @@ -0,0 +1,37 @@ +#ifndef KOKKOS_CUDA_MDRANGEPOLICY_HPP_ +#define KOKKOS_CUDA_MDRANGEPOLICY_HPP_ + +#include + +namespace Kokkos { + +template <> +struct default_outer_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +template <> +struct default_inner_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +namespace Impl { + +// Settings for MDRangePolicy +template <> +inline TileSizeProperties get_tile_size_properties( + const Kokkos::Cuda& space) { + TileSizeProperties properties; + properties.max_threads = + 
space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.default_largest_tile_size = 16; + properties.default_tile_size = 2; + properties.max_total_tile_size = 512; + return properties; +} + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp index 131d180980..2834e6f3de 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel.hpp @@ -60,6 +60,7 @@ #include #include #include +#include #include #include @@ -67,6 +68,7 @@ #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -474,7 +476,7 @@ class ParallelFor, Kokkos::Cuda> { Policy const& get_policy() const { return m_policy; } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -537,9 +539,23 @@ class ParallelFor, Kokkos::Cuda> { const Policy m_rp; public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + } Policy const& get_policy() const { return m_rp; } - - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { Kokkos::Impl::DeviceIterateTile(m_rp, m_functor) .exec_range(); @@ -689,7 +705,7 @@ class 
ParallelFor, public: Policy const& get_policy() const { return m_policy; } - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -1248,8 +1264,21 @@ class ParallelReduce, ReducerType, using DummySHMEMReductionType = int; public: + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + cudaFuncAttributes attr = + CudaParallelLaunch::get_cuda_func_attributes(); + auto const& prop = pol.space().cuda_device_prop(); + // Limits due to registers/SM + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast(Kokkos::Impl::CudaTraits::MaxHierarchicalParallelism)); + } Policy const& get_policy() const { return m_policy; } - inline __device__ void exec_range(reference_type update) const { Kokkos::Impl::Reduce::DeviceIterateTile, ReducerType, .exec_range(); } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { /* run(Kokkos::Impl::if_c::select(1,1.0) ); } @@ -2074,7 +2103,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2110,7 +2139,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2195,7 +2224,7 @@ class ParallelScan, Kokkos::Cuda> { //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const {
#ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { typename ValueTraits::value_type value; @@ -2364,7 +2393,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2400,7 +2429,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -2487,7 +2516,7 @@ class ParallelScanWithTotal, //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { #ifdef KOKKOS_IMPL_DEBUG_CUDA_SERIAL_EXECUTION if (m_run_serial) { typename ValueTraits::value_type value; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index 4b472f5d4f..e780639015 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -661,13 +661,14 @@ KOKKOS_INLINE_FUNCTION thread, count); } -template -KOKKOS_INLINE_FUNCTION - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const Impl::CudaTeamMember& thread, iType arg_begin, - iType arg_end) { +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, Impl::CudaTeamMember> +ThreadVectorRange(const Impl::CudaTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename std::common_type::type; return Impl::ThreadVectorRangeBoundariesStruct( - thread, arg_begin, arg_end); + thread, iType(arg_begin), iType(arg_end)); } KOKKOS_INLINE_FUNCTION @@ -983,7 +984,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( 
//---------------------------------------------------------------------------- -/** \brief Intra-thread vector parallel exclusive prefix sum. +/** \brief Intra-thread vector parallel scan with reducer. * * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) * @@ -991,25 +992,25 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( * thread and a scan operation is performed. * The last call to closure has final == true. */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct& - loop_boundaries, - const Closure& closure) { +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::CudaTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { (void)loop_boundaries; (void)closure; + (void)reducer; #ifdef __CUDA_ARCH__ - // Extract value_type from closure - - using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + using value_type = typename ReducerType::value_type; + value_type accum; + reducer.init(accum); + const value_type identity = accum; // Loop through boundaries by vector-length chunks // must scan at each iteration - value_type accum = 0; - // All thread "lanes" must loop the same number of times. // Determine an loop end for all thread "lanes." // Requires: @@ -1026,44 +1027,68 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0); for (int i = threadIdx.x; i < end; i += blockDim.x) { - value_type val = 0; + value_type val = identity; - // First acquire per-lane contributions: - if (i < loop_boundaries.end) closure(i, val, false); + // First acquire per-lane contributions. 
+ // This sets i's val to i-1's contribution + // to make the latter in_place_shfl_up an + // exclusive scan -- the final accumulation + // of i's val will be included in the second + // closure call later. + if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false); - value_type sval = val; - - // Bottom up inclusive scan in triangular pattern + // Bottom up exclusive scan in triangular pattern // where each CUDA thread is the root of a reduction tree // from the zeroth "lane" to itself. // [t] += [t-1] if t >= 1 // [t] += [t-2] if t >= 2 // [t] += [t-4] if t >= 4 // ... - + // This differs from the non-reducer overload, where an inclusive scan was + // implemented, because in general the binary operator cannot be inverted + // and we would not be able to remove the inclusive contribution by + // inversion. for (int j = 1; j < (int)blockDim.x; j <<= 1) { - value_type tmp = 0; - Impl::in_place_shfl_up(tmp, sval, j, blockDim.x, active_mask); + value_type tmp = identity; + Impl::in_place_shfl_up(tmp, val, j, blockDim.x, active_mask); if (j <= (int)threadIdx.x) { - sval += tmp; + reducer.join(val, tmp); } } - // Include accumulation and remove value for exclusive scan: - val = accum + sval - val; + // Include accumulation + reducer.join(val, accum); - // Provide exclusive scan value: + // Update i's contribution into the val + // and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - - // Accumulate the last value in the inclusive scan: - Impl::in_place_shfl(sval, sval, mask, blockDim.x, active_mask); - - accum += sval; + Impl::in_place_shfl(accum, val, mask, blockDim.x, active_mask); } #endif } +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel exclusive prefix sum. 
+ * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); +} + } // namespace Kokkos namespace Kokkos { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp index f24abb377d..c55956ede9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_View.hpp @@ -139,7 +139,7 @@ struct CudaLDGFetch { template KOKKOS_INLINE_FUNCTION ValueType operator[](const iType& i) const { -#ifdef __CUDA_ARCH__ +#if defined(__CUDA_ARCH__) && (350 <= __CUDA_ARCH__) AliasType v = __ldg(reinterpret_cast(&m_ptr[i])); return *(reinterpret_cast(&v)); #else diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp index 05876a9f02..fc52e41514 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_WorkGraphPolicy.hpp @@ -46,6 +46,7 @@ #define KOKKOS_CUDA_WORKGRAPHPOLICY_HPP #include +#include namespace Kokkos { namespace Impl { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 89135b6c45..9278d1bdc9 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -75,17 +75,6 @@ void hipOccupancy(int *numBlocks, int blockSize, int sharedmem) { hipOccupancy( numBlocks,
blockSize, sharedmem); } -template -struct HIPGetMaxBlockSize; - -template -int hip_get_max_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - return HIPGetMaxBlockSize::get_block_size( - f, vector_length, shmem_extra_block, shmem_extra_thread); -} template int hip_internal_get_block_size(const F &condition_check, @@ -131,10 +120,6 @@ int hip_internal_get_block_size(const F &condition_check, int opt_block_size = (blocks_per_sm >= min_blocks_per_sm) ? block_size : min_blocks_per_sm; int opt_threads_per_sm = threads_per_sm; - // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i - // Achieved: %i %i Opt: %i %i\n",block_size, - // shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem, - // regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm); block_size -= HIPTraits::WarpSize; while (condition_check(blocks_per_sm) && (block_size >= HIPTraits::WarpSize)) { @@ -160,10 +145,6 @@ int hip_internal_get_block_size(const F &condition_check, opt_threads_per_sm = threads_per_sm; } } - // printf("BlockSizeMax: %i Shmem: %i %i %i %i Regs: %i %i Blocks: %i %i - // Achieved: %i %i Opt: %i %i\n",block_size, - // shmem_per_sm,max_shmem_per_block,functor_shmem,total_shmem, - // regs_per_sm,regs_per_wavefront,max_blocks_shmem,max_blocks_regs,blocks_per_sm,threads_per_sm,opt_block_size,opt_threads_per_sm); block_size -= HIPTraits::WarpSize; } return opt_block_size; @@ -178,62 +159,6 @@ int hip_get_max_block_size(const HIPInternal *hip_instance, [](int x) { return x == 0; }, hip_instance, attr, f, vector_length, shmem_block, shmem_thread); } -template -struct HIPGetMaxBlockSize { - static int get_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - int numBlocks = 0; - int blockSize 
= LaunchBounds::maxTperB == 0 ? 1024 : LaunchBounds::maxTperB; - int sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - - if (numBlocks > 0) return blockSize; - while (blockSize > HIPTraits::WarpSize && numBlocks == 0) { - blockSize /= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - } - int blockSizeUpperBound = blockSize * 2; - while (blockSize < blockSizeUpperBound && numBlocks > 0) { - blockSize += HIPTraits::WarpSize; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - } - return blockSize - HIPTraits::WarpSize; - } -}; - -template -struct HIPGetOptBlockSize; - -template -int hip_get_opt_block_size(typename DriverType::functor_type const &f, - size_t const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - return HIPGetOptBlockSize< - DriverType, LaunchBounds, - (HIPTraits::ConstantMemoryUseThreshold < - sizeof(DriverType))>::get_block_size(f, vector_length, shmem_extra_block, - shmem_extra_thread); -} template int hip_get_opt_block_size(HIPInternal const *hip_instance, @@ -245,157 +170,6 @@ int hip_get_opt_block_size(HIPInternal const *hip_instance, shmem_block, shmem_thread); } -// FIXME_HIP the code is identical to the false struct except for -// hip_parallel_launch_constant_memory -template -struct HIPGetOptBlockSize, true> { - static int get_block_size(typename DriverType::functor_type const &f, - size_t 
const vector_length, - size_t const shmem_extra_block, - size_t const shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - - while (blockSize < HIPTraits::MaxThreadsPerBlock) { - blockSize *= 2; - - // calculate the occupancy with that optBlockSize and check whether its - // larger than the largest one found so far - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - hipOccupancy(&numBlocks, blockSize, sharedmem); - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - return bestBlockSize; - } -}; - -template -struct HIPGetOptBlockSize, false> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - - while (blockSize < HIPTraits::MaxThreadsPerBlock) { - blockSize *= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy(&numBlocks, blockSize, sharedmem); - - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - return bestBlockSize; - } -}; - -// FIXME_HIP the code is identical to the false struct except for -// hip_parallel_launch_constant_memory -template -struct HIPGetOptBlockSize< - DriverType, Kokkos::LaunchBounds, - true> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const 
size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - int max_threads_per_block = - std::min(MaxThreadsPerBlock, - hip_internal_maximum_warp_count() * HIPTraits::WarpSize); - - while (blockSize < max_threads_per_block) { - blockSize *= 2; - - // calculate the occupancy with that optBlockSize and check whether its - // larger than the largest one found so far - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - hipOccupancy( - &numBlocks, blockSize, sharedmem); - if (numBlocks >= static_cast(MinBlocksPerSM) && - blockSize <= static_cast(MaxThreadsPerBlock)) { - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - } - if (maxOccupancy > 0) return bestBlockSize; - return -1; - } -}; - -template -struct HIPGetOptBlockSize< - DriverType, Kokkos::LaunchBounds, - false> { - static int get_block_size(const typename DriverType::functor_type &f, - const size_t vector_length, - const size_t shmem_extra_block, - const size_t shmem_extra_thread) { - int blockSize = HIPTraits::WarpSize / 2; - int numBlocks; - int sharedmem; - int maxOccupancy = 0; - int bestBlockSize = 0; - int max_threads_per_block = - std::min(MaxThreadsPerBlock, - hip_internal_maximum_warp_count() * HIPTraits::WarpSize); - - while (blockSize < max_threads_per_block) { - blockSize *= 2; - sharedmem = - shmem_extra_block + shmem_extra_thread * (blockSize / vector_length) + - ::Kokkos::Impl::FunctorTeamShmemSize< - typename DriverType::functor_type>::value(f, blockSize / - vector_length); - - hipOccupancy( - &numBlocks, blockSize, sharedmem); - if (numBlocks >= int(MinBlocksPerSM) && - blockSize <= int(MaxThreadsPerBlock)) { - if (maxOccupancy < numBlocks * blockSize) { - maxOccupancy = 
numBlocks * blockSize; - bestBlockSize = blockSize; - } - } - } - if (maxOccupancy > 0) return bestBlockSize; - return -1; - } -}; - } // namespace Impl } // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index 45512038ac..18ef10e22c 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -164,6 +164,8 @@ HIPInternal &HIPInternal::singleton() { void HIPInternal::fence() const { HIP_SAFE_CALL(hipStreamSynchronize(m_stream)); + // can reset our cycle id now as well + m_cycleId = 0; } void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { @@ -256,7 +258,7 @@ void HIPInternal::initialize(int hip_device_id, hipStream_t stream) { void>; Record *const r = Record::allocate(Kokkos::Experimental::HIPSpace(), - "InternalScratchBitset", + "Kokkos::InternalScratchBitset", sizeof(uint32_t) * buffer_bound); Record::increment(r); @@ -303,8 +305,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_space( Kokkos::Impl::SharedAllocationRecord; - static Record *const r = Record::allocate( - Kokkos::Experimental::HIPSpace(), "InternalScratchSpace", + if (m_scratchSpace) Record::decrement(Record::get_record(m_scratchSpace)); + + Record *const r = Record::allocate( + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchSpace", (sizeScratchGrain * m_scratchSpaceCount)); Record::increment(r); @@ -325,8 +329,10 @@ Kokkos::Experimental::HIP::size_type *HIPInternal::scratch_flags( Kokkos::Impl::SharedAllocationRecord; + if (m_scratchFlags) Record::decrement(Record::get_record(m_scratchFlags)); + Record *const r = Record::allocate( - Kokkos::Experimental::HIPSpace(), "InternalScratchFlags", + Kokkos::Experimental::HIPSpace(), "Kokkos::InternalScratchFlags", (sizeScratchGrain * m_scratchFlagsCount)); Record::increment(r); @@ -345,7 +351,7 @@ void 
*HIPInternal::resize_team_scratch_space(std::int64_t bytes, if (m_team_scratch_current_size == 0) { m_team_scratch_current_size = bytes; m_team_scratch_ptr = Kokkos::kokkos_malloc( - "HIPSpace::ScratchMemory", m_team_scratch_current_size); + "Kokkos::HIPSpace::TeamScratchMemory", m_team_scratch_current_size); } if ((bytes > m_team_scratch_current_size) || ((bytes < m_team_scratch_current_size) && (force_shrink))) { @@ -388,6 +394,40 @@ void HIPInternal::finalize() { m_team_scratch_current_size = 0; m_team_scratch_ptr = nullptr; } + if (nullptr != d_driverWorkArray) { + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + d_driverWorkArray = nullptr; + } +} + +char *HIPInternal::get_next_driver(size_t driverTypeSize) const { + std::lock_guard const lock(m_mutexWorkArray); + if (d_driverWorkArray == nullptr) { + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } + if (driverTypeSize > m_maxDriverTypeSize) { + // fence handles the cycle id reset for us + fence(); + HIP_SAFE_CALL(hipHostFree(d_driverWorkArray)); + m_maxDriverTypeSize = driverTypeSize; + if (m_maxDriverTypeSize % 128 != 0) + m_maxDriverTypeSize = + m_maxDriverTypeSize + 128 - m_maxDriverTypeSize % 128; + HIP_SAFE_CALL( + hipHostMalloc(&d_driverWorkArray, + m_maxDriverCycles * m_maxDriverTypeSize * sizeof(char), + hipHostMallocNonCoherent)); + } else { + m_cycleId = (m_cycleId + 1) % m_maxDriverCycles; + if (m_cycleId == 0) { + // ensure any outstanding kernels are completed before we wrap around + fence(); + } + } + return &d_driverWorkArray[m_maxDriverTypeSize * m_cycleId]; } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 07ec8625e6..f4f88628e3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -49,6 
+49,8 @@ #include +#include + namespace Kokkos { namespace Experimental { namespace Impl { @@ -83,33 +85,46 @@ class HIPInternal { public: using size_type = ::Kokkos::Experimental::HIP::size_type; - int m_hipDev; - int m_hipArch; - unsigned m_multiProcCount; - unsigned m_maxWarpCount; - unsigned m_maxBlock; - unsigned m_maxBlocksPerSM; - unsigned m_maxSharedWords; + int m_hipDev = -1; + int m_hipArch = -1; + unsigned m_multiProcCount = 0; + unsigned m_maxWarpCount = 0; + unsigned m_maxBlock = 0; + unsigned m_maxBlocksPerSM = 0; + unsigned m_maxSharedWords = 0; int m_regsPerSM; - int m_shmemPerSM; - int m_maxShmemPerBlock; - int m_maxThreadsPerSM; + int m_shmemPerSM = 0; + int m_maxShmemPerBlock = 0; + int m_maxThreadsPerSM = 0; + + // array of DriverTypes to be allocated in host-pinned memory for async + // kernel launches + mutable char *d_driverWorkArray = nullptr; + // number of kernel launches that can be in-flight w/o synchronization + const int m_maxDriverCycles = 100; + // max size of a DriverType [bytes] + mutable size_t m_maxDriverTypeSize = 1024 * 10; + // the current index in the driverWorkArray + mutable int m_cycleId = 0; + // mutex to access d_driverWorkArray + mutable std::mutex m_mutexWorkArray; // Scratch Spaces for Reductions - size_type m_scratchSpaceCount; - size_type m_scratchFlagsCount; + size_type m_scratchSpaceCount = 0; + size_type m_scratchFlagsCount = 0; - size_type *m_scratchSpace; - size_type *m_scratchFlags; + size_type *m_scratchSpace = nullptr; + size_type *m_scratchFlags = nullptr; uint32_t *m_scratchConcurrentBitset = nullptr; hipDeviceProp_t m_deviceProp; - hipStream_t m_stream; + hipStream_t m_stream = nullptr; // Team Scratch Level 1 Space - mutable int64_t m_team_scratch_current_size; - mutable void *m_team_scratch_ptr; + mutable int64_t m_team_scratch_current_size = 0; + mutable void *m_team_scratch_ptr = nullptr; + mutable std::mutex m_team_scratch_mutex; bool was_finalized = false; @@ -117,9 +132,7 @@ class HIPInternal { int 
verify_is_initialized(const char *const label) const; - int is_initialized() const { - return m_hipDev >= 0; - } // 0 != m_scratchSpace && 0 != m_scratchFlags ; } + int is_initialized() const { return m_hipDev >= 0; } void initialize(int hip_device_id, hipStream_t stream = nullptr); void finalize(); @@ -128,25 +141,12 @@ class HIPInternal { void fence() const; + // returns the next driver type pointer in our work array + char *get_next_driver(size_t driverTypeSize) const; + ~HIPInternal(); - HIPInternal() - : m_hipDev(-1), - m_hipArch(-1), - m_multiProcCount(0), - m_maxWarpCount(0), - m_maxBlock(0), - m_maxSharedWords(0), - m_shmemPerSM(0), - m_maxShmemPerBlock(0), - m_maxThreadsPerSM(0), - m_scratchSpaceCount(0), - m_scratchFlagsCount(0), - m_scratchSpace(nullptr), - m_scratchFlags(nullptr), - m_stream(nullptr), - m_team_scratch_current_size(0), - m_team_scratch_ptr(nullptr) {} + HIPInternal() = default; // Resizing of reduction related scratch spaces size_type *scratch_space(const size_type size); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 3e972c7346..f774423b37 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -49,9 +49,9 @@ #if defined(__HIPCC__) -#include #include #include +#include // Must use global variable on the device with HIP-Clang #ifdef __HIP__ @@ -127,19 +127,69 @@ struct HIPDispatchProperties { HIPLaunchMechanism launch_mechanism = l; }; -template , +template +struct HIPParallelLaunchKernelFunc; + +template +struct HIPParallelLaunchKernelFunc< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory> { + static auto get_kernel_func() { + return hip_parallel_launch_local_memory; + } +}; + +template +struct HIPParallelLaunchKernelFunc, + HIPLaunchMechanism::LocalMemory> { + static auto get_kernel_func() { + return hip_parallel_launch_local_memory; + } +}; + +template +struct 
HIPParallelLaunchKernelInvoker; + +template +struct HIPParallelLaunchKernelInvoker + : HIPParallelLaunchKernelFunc { + using base_t = HIPParallelLaunchKernelFunc; + + static void invoke_kernel(DriverType const *driver, dim3 const &grid, + dim3 const &block, int shmem, + HIPInternal const *hip_instance) { + (base_t::get_kernel_func())<<m_stream>>>( + driver); + } +}; + +template , HIPLaunchMechanism LaunchMechanism = HIPLaunchMechanism::LocalMemory> struct HIPParallelLaunch; -template struct HIPParallelLaunch< DriverType, Kokkos::LaunchBounds, - HIPLaunchMechanism::LocalMemory> { - inline HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, - const HIPInternal *hip_instance, - const bool /*prefer_shmem*/) { + HIPLaunchMechanism::LocalMemory> + : HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory> { + using base_t = HIPParallelLaunchKernelInvoker< + DriverType, Kokkos::LaunchBounds, + HIPLaunchMechanism::LocalMemory>; + + HIPParallelLaunch(const DriverType &driver, const dim3 &grid, + const dim3 &block, const int shmem, + const HIPInternal *hip_instance, + const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { if (hip_instance->m_maxShmemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( @@ -148,72 +198,16 @@ struct HIPParallelLaunch< KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); - // FIXME_HIP -- there is currently an error copying (some) structs - // by value to the device in HIP-Clang / VDI - // As a workaround, we can malloc the DriverType and explictly copy over. 
- // To remove once solved in HIP - DriverType *d_driver; - HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType))); - HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType), - hipMemcpyHostToDevice, - hip_instance->m_stream)); - hip_parallel_launch_local_memory - <<m_stream>>>(d_driver); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - HIP_SAFE_CALL(hipGetLastError()); - hip_instance->fence(); -#endif - HIP_SAFE_CALL(hipFree(d_driver)); - } - } - - static hipFuncAttributes get_hip_func_attributes() { - static hipFuncAttributes attr = []() { - hipFuncAttributes attr; - HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, - reinterpret_cast( - hip_parallel_launch_local_memory))); - return attr; - }(); - return attr; - } -}; - -template -struct HIPParallelLaunch, - HIPLaunchMechanism::LocalMemory> { - inline HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, - const HIPInternal *hip_instance, - const bool /*prefer_shmem*/) { - if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { - Kokkos::Impl::throw_runtime_exception(std::string( - "HIPParallelLaunch FAILED: shared memory request is too large")); - } - - KOKKOS_ENSURE_HIP_LOCK_ARRAYS_ON_DEVICE(); - // Invoke the driver function on the device - - // FIXME_HIP -- see note about struct copy by value above - DriverType *d_driver; - HIP_SAFE_CALL(hipMalloc(&d_driver, sizeof(DriverType))); - HIP_SAFE_CALL(hipMemcpyAsync(d_driver, &driver, sizeof(DriverType), - hipMemcpyHostToDevice, - hip_instance->m_stream)); - hip_parallel_launch_local_memory - <<m_stream>>>(d_driver); + DriverType *d_driver = reinterpret_cast( + hip_instance->get_next_driver(sizeof(DriverType))); + std::memcpy((void *)d_driver, (void *)&driver, sizeof(DriverType)); + base_t::invoke_kernel(d_driver, grid, block, shmem, hip_instance); #if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) HIP_SAFE_CALL(hipGetLastError()); hip_instance->fence(); #endif 
- HIP_SAFE_CALL(hipFree(d_driver)); } } @@ -221,8 +215,7 @@ struct HIPParallelLaunch, static hipFuncAttributes attr = []() { hipFuncAttributes attr; HIP_SAFE_CALL(hipFuncGetAttributes( - &attr, reinterpret_cast( - hip_parallel_launch_local_memory))); + &attr, reinterpret_cast(base_t::get_kernel_func()))); return attr; }(); return attr; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp new file mode 100644 index 0000000000..ce1aff9586 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_MDRangePolicy.hpp @@ -0,0 +1,37 @@ +#ifndef KOKKOS_HIP_MDRANGEPOLICY_HPP_ +#define KOKKOS_HIP_MDRANGEPOLICY_HPP_ + +#include + +namespace Kokkos { + +template <> +struct default_outer_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +template <> +struct default_inner_direction { + using type = Iterate; + static constexpr Iterate value = Iterate::Left; +}; + +namespace Impl { + +// Settings for MDRangePolicy +template <> +inline TileSizeProperties get_tile_size_properties( + const Kokkos::Experimental::HIP& space) { + TileSizeProperties properties; + properties.max_threads = + space.impl_internal_space_instance()->m_maxThreadsPerSM; + properties.default_largest_tile_size = 16; + properties.default_tile_size = 4; + properties.max_total_tile_size = 1024; + return properties; +} + +} // Namespace Impl +} // Namespace Kokkos +#endif diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp index 6b831ff7a3..35e7d6fb85 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_MDRange.hpp @@ -49,6 +49,7 @@ #include #include #include +#include #include namespace Kokkos { @@ -72,7 +73,7 @@ class ParallelFor, ParallelFor& operator=(ParallelFor const&) = delete; public: - inline __device__ void operator()(void) const { + inline __device__ void 
operator()() const { Kokkos::Impl::DeviceIterateTile(m_policy, m_functor) @@ -175,6 +176,25 @@ class ParallelFor, ParallelFor(FunctorType const& arg_functor, Policy const& arg_policy) : m_functor(arg_functor), m_policy(arg_policy) {} + + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + using closure_type = + ParallelFor, + Kokkos::Experimental::HIP>; + hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + auto const& prop = pol.space().hip_device_prop(); + // Limits due to registers/SM, MDRange doesn't have + // shared memory constraints + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast( + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + } }; // ParallelReduce @@ -231,7 +251,7 @@ class ParallelReduce, ReducerType, DeviceIteratePattern(m_policy, m_functor, update).exec_range(); } - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const integral_nonzero_constant word_count(ValueTraits::value_size( @@ -291,13 +311,19 @@ class ParallelReduce, ReducerType, ::Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock; int shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); + using closure_type = Impl::ParallelReduce; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); while ( (n && (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size)) || - (n > static_cast( - ::Kokkos::Experimental::Impl::hip_get_max_block_size< - ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) { + (n > + static_cast( + ::Kokkos::Experimental::Impl::hip_get_max_block_size( + 
m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { n >>= 1; shmem_size = ::Kokkos::Impl::hip_single_inter_block_reduce_scan_shmem< false, FunctorType, WorkTag>(f, n); @@ -391,6 +417,23 @@ class ParallelReduce, ReducerType, memory_space>::accessible), m_scratch_space(nullptr), m_scratch_flags(nullptr) {} + template + static int max_tile_size_product(const Policy& pol, const Functor&) { + using closure_type = + ParallelReduce, + ReducerType, Kokkos::Experimental::HIP>; + hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); + auto const& prop = pol.space().hip_device_prop(); + // Limits due do registers/SM + int const regs_per_sm = prop.regsPerMultiprocessor; + int const regs_per_thread = attr.numRegs; + int const max_threads_per_sm = regs_per_sm / regs_per_thread; + return std::min( + max_threads_per_sm, + static_cast( + Kokkos::Experimental::Impl::HIPTraits::MaxThreadsPerBlock)); + } }; } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp index 5607f1c91a..7d2825eeb4 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Range.hpp @@ -92,7 +92,7 @@ class ParallelFor, public: using functor_type = FunctorType; - inline __device__ void operator()(void) const { + inline __device__ void operator()() const { const Member work_stride = blockDim.y * gridDim.x; const Member work_end = m_policy.end(); @@ -174,11 +174,14 @@ class ParallelReduce, ReducerType, size_type* m_scratch_space = nullptr; size_type* m_scratch_flags = nullptr; - // FIXME_HIP_PERFORMANCE Need a rule to choose when to use shared memory and - // when to use shuffle +#if HIP_VERSION < 401 static bool constexpr UseShflReduction = ((sizeof(value_type) > 2 * sizeof(double)) && static_cast(ValueTraits::StaticValueSize)); +#else + 
static bool constexpr UseShflReduction = + static_cast(ValueTraits::StaticValueSize); +#endif private: struct ShflReductionTag {}; @@ -330,13 +333,19 @@ class ParallelReduce, ReducerType, int shmem_size = hip_single_inter_block_reduce_scan_shmem( f, n); + using closure_type = Impl::ParallelReduce; + hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< + closure_type, LaunchBounds>::get_hip_func_attributes(); while ( (n && (m_policy.space().impl_internal_space_instance()->m_maxShmemPerBlock < shmem_size)) || - (n > static_cast( - Kokkos::Experimental::Impl::hip_get_max_block_size< - ParallelReduce, LaunchBounds>(f, 1, shmem_size, 0)))) { + (n > + static_cast( + ::Kokkos::Experimental::Impl::hip_get_max_block_size( + m_policy.space().impl_internal_space_instance(), attr, f, 1, + shmem_size, 0)))) { n >>= 1; shmem_size = hip_single_inter_block_reduce_scan_shmem( @@ -493,7 +502,7 @@ class ParallelScanHIPBase { //---------------------------------------- - __device__ inline void initial(void) const { + __device__ inline void initial() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -529,7 +538,7 @@ class ParallelScanHIPBase { //---------------------------------------- - __device__ inline void final(void) const { + __device__ inline void final() const { const integral_nonzero_constant word_count(ValueTraits::value_size(m_functor) / sizeof(size_type)); @@ -606,7 +615,7 @@ class ParallelScanHIPBase { public: //---------------------------------------- - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { if (!m_final) { initial(); } else { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp index 5da83d289e..96c3ff2a75 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Parallel_Team.hpp @@ -433,6 +433,9 @@ class ParallelFor, int 
m_shmem_size; void* m_scratch_ptr[2]; int m_scratch_size[2]; + // Only let one ParallelFor/Reduce modify the team scratch memory. The + // constructor acquires the mutex which is released in the destructor. + std::unique_lock m_scratch_lock; template __device__ inline @@ -449,7 +452,7 @@ class ParallelFor, } public: - __device__ inline void operator()(void) const { + __device__ inline void operator()() const { // Iterate this block through the league int64_t threadid = 0; if (m_scratch_size[1] > 0) { @@ -513,7 +516,10 @@ class ParallelFor, m_policy(arg_policy), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = ::Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelFor, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -640,6 +646,9 @@ class ParallelReduce, const size_type m_league_size; int m_team_size; const size_type m_vector_size; + // Only let one ParallelFor/Reduce modify the team scratch memory. The + // constructor acquires the mutex which is released in the destructor. 
+ std::unique_lock m_scratch_lock; template __device__ inline @@ -877,7 +886,10 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = @@ -976,7 +988,10 @@ class ParallelReduce, m_scratch_ptr{nullptr, nullptr}, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), - m_vector_size(arg_policy.impl_vector_length()) { + m_vector_size(arg_policy.impl_vector_length()), + m_scratch_lock(m_policy.space() + .impl_internal_space_instance() + ->m_team_scratch_mutex) { hipFuncAttributes attr = Kokkos::Experimental::Impl::HIPParallelLaunch< ParallelReduce, launch_bounds>::get_hip_func_attributes(); m_team_size = diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 00cef28f82..15ca089d14 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -42,12 +42,6 @@ //@HEADER */ -#include -#include -#include -#include -#include -#include #include #include @@ -57,6 +51,13 @@ #include #include +#include +#include +#include +#include +#include +#include + /*--------------------------------------------------------------------------*/ /*--------------------------------------------------------------------------*/ namespace Kokkos { @@ -172,14 +173,14 @@ void DeepCopyAsyncHIP(void* dst, void const* src, size_t n) { namespace Kokkos { -void Experimental::HIPSpace::access_error() { +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error() { const std::string msg( "Kokkos::Experimental::HIPSpace::access_error attempt to execute " 
"Experimental::HIP function from non-HIP space"); Kokkos::Impl::throw_runtime_exception(msg); } -void Experimental::HIPSpace::access_error(const void* const) { +KOKKOS_DEPRECATED void Experimental::HIPSpace::access_error(const void* const) { const std::string msg( "Kokkos::Experimental::HIPSpace::access_error attempt to execute " "Experimental::HIP function from non-HIP space"); @@ -326,45 +327,6 @@ SharedAllocationRecord SharedAllocationRecord< Kokkos::Experimental::HIPHostPinnedSpace, void>::s_root_record; #endif -std::string SharedAllocationRecord::get_label() const { - SharedAllocationHeader header; - - Kokkos::Impl::DeepCopy( - &header, RecordBase::head(), sizeof(SharedAllocationHeader)); - - return std::string(header.m_label); -} - -std::string SharedAllocationRecord::get_label() const { - return std::string(RecordBase::head()->m_label); -} - -SharedAllocationRecord* -SharedAllocationRecord::allocate( - const Kokkos::Experimental::HIPSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -SharedAllocationRecord* -SharedAllocationRecord:: - allocate(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size) { - return new SharedAllocationRecord(arg_space, arg_label, arg_alloc_size); -} - -void SharedAllocationRecord::deallocate( - SharedAllocationRecord* arg_rec) { - delete static_cast(arg_rec); -} - -void SharedAllocationRecord:: - deallocate(SharedAllocationRecord* arg_rec) { - delete static_cast(arg_rec); -} - SharedAllocationRecord::~SharedAllocationRecord() { const char* label = nullptr; @@ -393,7 +355,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG 
&SharedAllocationRecord::s_root_record, @@ -405,13 +367,7 @@ SharedAllocationRecord:: SharedAllocationHeader header; - // Fill in the Header information - header.m_record = static_cast*>(this); - - strncpy(header.m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - header.m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; + this->base_t::_fill_host_accessible_header_info(header, arg_label); // Copy to device memory Kokkos::Impl::DeepCopy( @@ -425,7 +381,7 @@ SharedAllocationRecord:: const SharedAllocationRecord::function_type arg_dealloc) // Pass through allocated [ SharedAllocationHeader , user_memory ] // Pass through deallocation function - : SharedAllocationRecord( + : base_t( #ifdef KOKKOS_ENABLE_DEBUG &SharedAllocationRecord::s_root_record, @@ -435,223 +391,8 @@ SharedAllocationRecord:: sizeof(SharedAllocationHeader) + arg_alloc_size, arg_dealloc), m_space(arg_space) { // Fill in the Header information, directly accessible via host pinned memory - - RecordBase::m_alloc_ptr->m_record = this; - - strncpy(RecordBase::m_alloc_ptr->m_label, arg_label.c_str(), - SharedAllocationHeader::maximum_label_length); - // Set last element zero, in case c_str is too long - RecordBase::m_alloc_ptr - ->m_label[SharedAllocationHeader::maximum_label_length - 1] = (char)0; -} - -//---------------------------------------------------------------------------- - -void* SharedAllocationRecord:: - allocate_tracked(const Kokkos::Experimental::HIPSpace& arg_space, - const std::string& arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void* const - arg_alloc_ptr) { - if (arg_alloc_ptr != nullptr) { - SharedAllocationRecord* const r = 
get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void* SharedAllocationRecord:: - reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -void* SharedAllocationRecord:: - allocate_tracked(const Kokkos::Experimental::HIPHostPinnedSpace& arg_space, - const std::string& arg_alloc_label, - const size_t arg_alloc_size) { - if (!arg_alloc_size) return nullptr; - - SharedAllocationRecord* const r = - allocate(arg_space, arg_alloc_label, arg_alloc_size); - - RecordBase::increment(r); - - return r->data(); -} - -void SharedAllocationRecord::deallocate_tracked(void* const - arg_alloc_ptr) { - if (arg_alloc_ptr) { - SharedAllocationRecord* const r = get_record(arg_alloc_ptr); - - RecordBase::decrement(r); - } -} - -void* SharedAllocationRecord:: - reallocate_tracked(void* const arg_alloc_ptr, const size_t arg_alloc_size) { - SharedAllocationRecord* const r_old = get_record(arg_alloc_ptr); - SharedAllocationRecord* const r_new = - allocate(r_old->m_space, r_old->get_label(), arg_alloc_size); - - using HIPHostPinnedSpace = Kokkos::Experimental::HIPHostPinnedSpace; - Kokkos::Impl::DeepCopy( - r_new->data(), r_old->data(), std::min(r_old->size(), r_new->size())); - - RecordBase::increment(r_new); - RecordBase::decrement(r_old); - - return r_new->data(); -} - -//---------------------------------------------------------------------------- - -SharedAllocationRecord* -SharedAllocationRecord::get_record( - void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHIP = - SharedAllocationRecord; - - // Copy the header from the allocation - Header head; - - Header 
const* const head_hip = - alloc_ptr ? Header::get_header(alloc_ptr) : nullptr; - - if (alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, head_hip, sizeof(SharedAllocationHeader)); - } - - RecordHIP* const record = - alloc_ptr ? static_cast(head.m_record) : nullptr; - - if (!alloc_ptr || record->m_alloc_ptr != head_hip) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< Kokkos::Experimental::HIPSpace " - ", void >::get_record ERROR")); - } - - return record; -} - -SharedAllocationRecord* -SharedAllocationRecord::get_record(void* alloc_ptr) { - using Header = SharedAllocationHeader; - using RecordHIP = - SharedAllocationRecord; - - Header* const h = - alloc_ptr ? reinterpret_cast(alloc_ptr) - 1 : nullptr; - - if (!alloc_ptr || h->m_record->m_alloc_ptr != h) { - Kokkos::Impl::throw_runtime_exception(std::string( - "Kokkos::Impl::SharedAllocationRecord< " - "Kokkos::Experimental::HIPHostPinnedSpace , void >::get_record ERROR")); - } - - return static_cast(h->m_record); -} - -// Iterate records to print orphaned memory ... 
-void SharedAllocationRecord:: - print_records(std::ostream& s, const Kokkos::Experimental::HIPSpace&, - bool detail) { -#ifdef KOKKOS_ENABLE_DEBUG - SharedAllocationRecord* r = &s_root_record; - - char buffer[256]; - - SharedAllocationHeader head; - - if (detail) { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - } else { - head.m_label[0] = 0; - } - - // Formatting dependent on sizeof(uintptr_t) - const char* format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = - "HIP addr( 0x%.12lx ) list( 0x%.12lx 0x%.12lx ) extent[ 0x%.12lx + " - "%.8ld ] count(%d) dealloc(0x%.12lx) %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = - "HIP addr( 0x%.12llx ) list( 0x%.12llx 0x%.12llx ) extent[ " - "0x%.12llx + %.8ld ] count(%d) dealloc(0x%.12llx) %s\n"; - } - - snprintf(buffer, 256, format_string, reinterpret_cast(r), - reinterpret_cast(r->m_prev), - reinterpret_cast(r->m_next), - reinterpret_cast(r->m_alloc_ptr), r->m_alloc_size, - r->m_count, reinterpret_cast(r->m_dealloc), - head.m_label); - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } else { - do { - if (r->m_alloc_ptr) { - Kokkos::Impl::DeepCopy( - &head, r->m_alloc_ptr, sizeof(SharedAllocationHeader)); - - // Formatting dependent on sizeof(uintptr_t) - const char* format_string; - - if (sizeof(uintptr_t) == sizeof(unsigned long)) { - format_string = "HIP [ 0x%.12lx + %ld ] %s\n"; - } else if (sizeof(uintptr_t) == sizeof(unsigned long long)) { - format_string = "HIP [ 0x%.12llx + %ld ] %s\n"; - } - - snprintf(buffer, 256, format_string, - reinterpret_cast(r->data()), r->size(), - head.m_label); - } else { - snprintf(buffer, 256, "HIP [ 0 + 0 ]\n"); - } - s << buffer; - r = r->m_next; - } while (r != &s_root_record); - } -#else - (void)s; - (void)detail; - throw_runtime_exception( - "Kokkos::Impl::SharedAllocationRecord::print_records" - " only works with 
KOKKOS_ENABLE_DEBUG enabled"); -#endif + this->base_t::_fill_host_accessible_header_info(*RecordBase::m_alloc_ptr, + arg_label); } } // namespace Impl @@ -680,63 +421,22 @@ void HIP::impl_initialize(const HIP::SelectDevice config) { void HIP::impl_finalize() { Impl::HIPInternal::singleton().finalize(); } HIP::HIP() - : m_space_instance(&Impl::HIPInternal::singleton()), m_counter(nullptr) { + : m_space_instance(&Impl::HIPInternal::singleton(), + [](Impl::HIPInternal*) {}) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); } HIP::HIP(hipStream_t const stream) - : m_space_instance(new Impl::HIPInternal), m_counter(new int(1)) { + : m_space_instance(new Impl::HIPInternal, [](Impl::HIPInternal* ptr) { + ptr->finalize(); + delete ptr; + }) { Impl::HIPInternal::singleton().verify_is_initialized( "HIP instance constructor"); m_space_instance->initialize(Impl::HIPInternal::singleton().m_hipDev, stream); } -KOKKOS_FUNCTION HIP::HIP(HIP&& other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; -} - -KOKKOS_FUNCTION HIP::HIP(HIP const& other) - : m_space_instance(other.m_space_instance), m_counter(other.m_counter) { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif -} - -KOKKOS_FUNCTION HIP& HIP::operator=(HIP&& other) noexcept { - m_space_instance = other.m_space_instance; - other.m_space_instance = nullptr; - m_counter = other.m_counter; - other.m_counter = nullptr; - - return *this; -} - -KOKKOS_FUNCTION HIP& HIP::operator=(HIP const& other) { - m_space_instance = other.m_space_instance; - m_counter = other.m_counter; -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter) Kokkos::atomic_add(m_counter, 1); -#endif - - return *this; -} - -KOKKOS_FUNCTION HIP::~HIP() noexcept { -#ifndef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_HIP_GPU - if (m_counter == nullptr) 
return; - int const count = Kokkos::atomic_fetch_sub(m_counter, 1); - if (count == 1) { - delete m_counter; - m_space_instance->finalize(); - delete m_space_instance; - } -#endif -} - void HIP::print_configuration(std::ostream& s, const bool) { Impl::HIPInternal::singleton().print_configuration(s); } @@ -810,3 +510,26 @@ void HIPSpaceInitializer::print_configuration(std::ostream& msg, } // namespace Impl } // namespace Kokkos + +//============================================================================== +// {{{1 + +#include + +namespace Kokkos { +namespace Impl { + +// To avoid additional compilation cost for something that's (mostly?) not +// performance sensitive, we explicity instantiate these CRTP base classes here, +// where we have access to the associated *_timpl.hpp header files. +template class HostInaccessibleSharedAllocationRecordCommon< + Kokkos::Experimental::HIPSpace>; +template class SharedAllocationRecordCommon; +template class SharedAllocationRecordCommon< + Kokkos::Experimental::HIPHostPinnedSpace>; + +} // end namespace Impl +} // end namespace Kokkos + +// end Explicit instantiations of CRTP Base classes }}}1 +//============================================================================== diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index 7571510c31..fe52886ced 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -644,13 +644,14 @@ KOKKOS_INLINE_FUNCTION thread, count); } -template -KOKKOS_INLINE_FUNCTION - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const Impl::HIPTeamMember& thread, iType arg_begin, - iType arg_end) { +template +KOKKOS_INLINE_FUNCTION Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, Impl::HIPTeamMember> +ThreadVectorRange(const Impl::HIPTeamMember& thread, iType1 arg_begin, + iType2 arg_end) { + using iType = typename std::common_type::type; return 
Impl::ThreadVectorRangeBoundariesStruct( - thread, arg_begin, arg_end); + thread, iType(arg_begin), iType(arg_end)); } KOKKOS_INLINE_FUNCTION @@ -961,7 +962,7 @@ KOKKOS_INLINE_FUNCTION //---------------------------------------------------------------------------- -/** \brief Intra-thread vector parallel exclusive prefix sum. +/** \brief Intra-thread vector parallel scan with reducer. * * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) * @@ -969,22 +970,21 @@ KOKKOS_INLINE_FUNCTION * thread and a scan operation is performed. * The last call to closure has final == true. */ -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct& - loop_boundaries, - const Closure& closure) { +template +KOKKOS_INLINE_FUNCTION + typename std::enable_if::value>::type + parallel_scan(const Impl::ThreadVectorRangeBoundariesStruct< + iType, Impl::HIPTeamMember>& loop_boundaries, + const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - // Extract value_type from closure - - using value_type = typename Kokkos::Impl::FunctorAnalysis< - Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + using value_type = typename ReducerType::value_type; + value_type accum; + reducer.init(accum); + const value_type identity = accum; // Loop through boundaries by vector-length chunks // must scan at each iteration - value_type accum = 0; - // All thread "lanes" must loop the same number of times. // Determine an loop end for all thread "lanes." // Requires: @@ -997,47 +997,72 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( const int end = loop_boundaries.end + (rem ? blockDim.x - rem : 0); for (int i = threadIdx.x; i < end; i += blockDim.x) { - value_type val = 0; + value_type val = identity; - // First acquire per-lane contributions: - if (i < loop_boundaries.end) closure(i, val, false); + // First acquire per-lane contributions. 
+ // This sets i's val to i-1's contribution + // to make the latter in_place_shfl_up an + // exclusive scan -- the final accumulation + // of i's val will be included in the second + // closure call later. + if (i < loop_boundaries.end && threadIdx.x > 0) closure(i - 1, val, false); - value_type sval = val; - - // Bottom up inclusive scan in triangular pattern + // Bottom up exclusive scan in triangular pattern // where each HIP thread is the root of a reduction tree // from the zeroth "lane" to itself. // [t] += [t-1] if t >= 1 // [t] += [t-2] if t >= 2 // [t] += [t-4] if t >= 4 // ... - + // This differs from the non-reducer overload, where an inclusive scan was + // implemented, because in general the binary operator cannot be inverted + // and we would not be able to remove the inclusive contribution by + // inversion. for (int j = 1; j < static_cast(blockDim.x); j <<= 1) { - value_type tmp = 0; - ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, sval, j, blockDim.x); + value_type tmp = identity; + ::Kokkos::Experimental::Impl::in_place_shfl_up(tmp, val, j, blockDim.x); if (j <= static_cast(threadIdx.x)) { - sval += tmp; + reducer.join(val, tmp); } } - // Include accumulation and remove value for exclusive scan: - val = accum + sval - val; + // Include accumulation + reducer.join(val, accum); - // Provide exclusive scan value: + // Update i's contribution into the val + // and add it to accum for next round if (i < loop_boundaries.end) closure(i, val, true); - - // Accumulate the last value in the inclusive scan: - ::Kokkos::Experimental::Impl::in_place_shfl(sval, sval, blockDim.x - 1, + ::Kokkos::Experimental::Impl::in_place_shfl(accum, val, blockDim.x - 1, blockDim.x); - - accum += sval; } #else (void)loop_boundaries; (void)closure; + (void)reducer; #endif } +//---------------------------------------------------------------------------- + +/** \brief Intra-thread vector parallel exclusive prefix sum. 
+ * + * Executes closure(iType i, ValueType & val, bool final) for each i=[0..N) + * + * The range [0..N) is mapped to all vector lanes in the + * thread and a scan operation is performed. + * The last call to closure has final == true. + */ +template +KOKKOS_INLINE_FUNCTION void parallel_scan( + const Impl::ThreadVectorRangeBoundariesStruct& + loop_boundaries, + const Closure& closure) { + using value_type = typename Kokkos::Impl::FunctorAnalysis< + Kokkos::Impl::FunctorPatternInterface::SCAN, void, Closure>::value_type; + value_type dummy; + parallel_scan(loop_boundaries, closure, Kokkos::Sum(dummy)); +} + } // namespace Kokkos namespace Kokkos { diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 140376425c..b7d8e62f69 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -48,17 +48,11 @@ #include #include - +#include #include #include -#include #include -#if defined(KOKKOS_ENABLE_CUDA) || \ - (defined(__HIPCC__) && defined(KOKKOS_ENABLE_HIP)) -#include -#endif - namespace Kokkos { // ------------------------------------------------------------------ // @@ -74,22 +68,14 @@ enum class Iterate template struct default_outer_direction { - using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - static constexpr Iterate value = Iterate::Left; -#else + using type = Iterate; static constexpr Iterate value = Iterate::Right; -#endif }; template struct default_inner_direction { - using type = Iterate; -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - static constexpr Iterate value = Iterate::Left; -#else + using type = Iterate; static constexpr Iterate value = Iterate::Right; -#endif }; // Iteration Pattern @@ -179,6 +165,25 @@ constexpr NVCC_WONT_LET_ME_CALL_YOU_Array to_array_potentially_narrowing( } return a; } + +struct TileSizeProperties { + int max_threads; + int default_largest_tile_size; + int 
default_tile_size; + int max_total_tile_size; +}; + +template +TileSizeProperties get_tile_size_properties(const ExecutionSpace&) { + // Host settings + TileSizeProperties properties; + properties.max_threads = std::numeric_limits::max(); + properties.default_largest_tile_size = 0; + properties.default_tile_size = 2; + properties.max_total_tile_size = std::numeric_limits::max(); + return properties; +} + } // namespace Impl // multi-dimensional iteration pattern @@ -208,7 +213,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { using launch_bounds = typename traits::launch_bounds; using member_type = typename range_policy::member_type; - enum { rank = static_cast(iteration_pattern::rank) }; + static constexpr int rank = iteration_pattern::rank; using index_type = typename traits::index_type; using array_index_type = std::int64_t; @@ -231,37 +236,20 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { point_type m_tile_end = {}; index_type m_num_tiles = 1; index_type m_prod_tile_dims = 1; + bool m_tune_tile_size = false; - /* - // NDE enum impl definition alternative - replace static constexpr int ? - enum { outer_direction = static_cast ( - (iteration_pattern::outer_direction != Iterate::Default) - ? iteration_pattern::outer_direction - : default_outer_direction< typename traits::execution_space>::value ) }; - - enum { inner_direction = static_cast ( - iteration_pattern::inner_direction != Iterate::Default - ? iteration_pattern::inner_direction - : default_inner_direction< typename traits::execution_space>::value ) }; - - enum { Right = static_cast( Iterate::Right ) }; - enum { Left = static_cast( Iterate::Left ) }; - */ - // static constexpr int rank = iteration_pattern::rank; - - static constexpr int outer_direction = static_cast( + static constexpr auto outer_direction = (iteration_pattern::outer_direction != Iterate::Default) ? 
iteration_pattern::outer_direction - : default_outer_direction::value); + : default_outer_direction::value; - static constexpr int inner_direction = static_cast( + static constexpr auto inner_direction = iteration_pattern::inner_direction != Iterate::Default ? iteration_pattern::inner_direction - : default_inner_direction::value); + : default_inner_direction::value; - // Ugly ugly workaround intel 14 not handling scoped enum correctly - static constexpr int Right = static_cast(Iterate::Right); - static constexpr int Left = static_cast(Iterate::Left); + static constexpr auto Right = Iterate::Right; + static constexpr auto Left = Iterate::Left; KOKKOS_INLINE_FUNCTION const typename traits::execution_space& space() const { return m_space; @@ -320,7 +308,7 @@ struct MDRangePolicy : public Kokkos::Impl::PolicyTraits { point_type const& lower, point_type const& upper, tile_type const& tile = tile_type{}) : m_space(work_space), m_lower(lower), m_upper(upper), m_tile(tile) { - init(); + init_helper(Impl::get_tile_size_properties(work_space)); } template { m_tile(p.m_tile), m_tile_end(p.m_tile_end), m_num_tiles(p.m_num_tiles), - m_prod_tile_dims(p.m_prod_tile_dims) {} + m_prod_tile_dims(p.m_prod_tile_dims), + m_tune_tile_size(p.m_tune_tile_size) {} + + void impl_change_tile_size(const point_type& tile) { + m_tile = tile; + init_helper(Impl::get_tile_size_properties(m_space)); + } + bool impl_tune_tile_size() const { return m_tune_tile_size; } private: - void init() { - // Host - if (true -#if defined(KOKKOS_ENABLE_CUDA) - && !std::is_same::value -#endif -#if defined(KOKKOS_ENABLE_HIP) - && !std::is_same::value -#endif - ) { - index_type span; - for (int i = 0; i < rank; ++i) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - m_tile[i] = 2; - } else { - m_tile[i] = (span == 0 ? 
1 : span); - } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } + void init_helper(Impl::TileSizeProperties properties) { + m_prod_tile_dims = 1; + int increment = 1; + int rank_start = 0; + int rank_end = rank; + if (inner_direction == Iterate::Right) { + increment = -1; + rank_start = rank - 1; + rank_end = -1; } -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_HIP) - else // Cuda or HIP - { - index_type span; - int increment = 1; - int rank_start = 0; - int rank_end = rank; - if ((int)inner_direction == (int)Right) { - increment = -1; - rank_start = rank - 1; - rank_end = -1; - } - bool is_cuda_exec_space = -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value; -#else - false; -#endif - for (int i = rank_start; i != rank_end; i += increment) { - span = m_upper[i] - m_lower[i]; - if (m_tile[i] <= 0) { - // TODO: determine what is a good default tile size for Cuda and HIP - // may be rank dependent - if (((int)inner_direction == (int)Right && (i < rank - 1)) || - ((int)inner_direction == (int)Left && (i > 0))) { - if (m_prod_tile_dims < 256) { - m_tile[i] = (is_cuda_exec_space) ? 
2 : 4; - } else { - m_tile[i] = 1; - } + for (int i = rank_start; i != rank_end; i += increment) { + const index_type length = m_upper[i] - m_lower[i]; + if (m_tile[i] <= 0) { + m_tune_tile_size = true; + if ((inner_direction == Iterate::Right && (i < rank - 1)) || + (inner_direction == Iterate::Left && (i > 0))) { + if (m_prod_tile_dims * properties.default_tile_size < + static_cast(properties.max_total_tile_size)) { + m_tile[i] = properties.default_tile_size; } else { - m_tile[i] = 16; + m_tile[i] = 1; } - } - m_tile_end[i] = - static_cast((span + m_tile[i] - 1) / m_tile[i]); - m_num_tiles *= m_tile_end[i]; - m_prod_tile_dims *= m_tile[i]; - } - if (m_prod_tile_dims > - 1024) { // Match Cuda restriction for ParallelReduce; 1024,1024,64 - // max per dim (Kepler), but product num_threads < 1024 - if (is_cuda_exec_space) { - printf(" Tile dimensions exceed Cuda limits\n"); - Kokkos::abort( - "Cuda ExecSpace Error: MDRange tile dims exceed maximum number " - "of threads per block - choose smaller tile dims"); } else { - printf(" Tile dimensions exceed HIP limits\n"); - Kokkos::abort( - "HIP ExecSpace Error: MDRange tile dims exceed maximum number of " - "threads per block - choose smaller tile dims"); + m_tile[i] = properties.default_largest_tile_size == 0 + ? 
std::max(length, 1) + : properties.default_largest_tile_size; } } + m_tile_end[i] = + static_cast((length + m_tile[i] - 1) / m_tile[i]); + m_num_tiles *= m_tile_end[i]; + m_prod_tile_dims *= m_tile[i]; + } + if (m_prod_tile_dims > static_cast(properties.max_threads)) { + printf(" Product of tile dimensions exceed maximum limit: %d\n", + static_cast(properties.max_threads)); + Kokkos::abort( + "ExecSpace Error: MDRange tile dims exceed maximum number " + "of threads per block - choose smaller tile dims"); } -#endif } }; diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 8e226a078d..fb94049d7a 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -104,20 +104,6 @@ struct MemorySpaceAccess { enum : bool { deepcopy = true }; }; -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} -}; - -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} -}; - } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index fb2925a066..6578723fc8 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -45,14 +45,13 @@ #define KOKKOS_COMPLEX_HPP #include +#include #include +#include #include +#include #include -#ifdef KOKKOS_ENABLE_SYCL -#include -#endif - namespace Kokkos { /// \class complex @@ -220,10 +219,11 @@ class // Conditional noexcept, just in case RType throws on divide-by-zero KOKKOS_CONSTEXPR_14 KOKKOS_INLINE_FUNCTION complex& operator/=( const complex& y) noexcept(noexcept(RealType{} / RealType{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" 
of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs(y.real()) + std::fabs(y.imag()); + const RealType s = fabs(y.real()) + fabs(y.imag()); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -248,10 +248,11 @@ class KOKKOS_INLINE_FUNCTION complex& operator/=( const std::complex& y) noexcept(noexcept(RealType{} / RealType{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. - const RealType s = std::fabs(y.real()) + std::fabs(y.imag()); + const RealType s = fabs(y.real()) + fabs(y.imag()); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. // In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -693,35 +694,96 @@ KOKKOS_INLINE_FUNCTION RealType real(const complex& x) noexcept { return x.real(); } +//! Constructs a complex number from magnitude and phase angle +template +KOKKOS_INLINE_FUNCTION complex polar(const T& r, const T& theta = T()) { + using Kokkos::Experimental::cos; + using Kokkos::Experimental::sin; + KOKKOS_EXPECTS(r >= 0); + return complex(r * cos(theta), r * sin(theta)); +} + //! Absolute value (magnitude) of a complex number. template KOKKOS_INLINE_FUNCTION RealType abs(const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::hypot; -#else - using std::hypot; -#endif + using Kokkos::Experimental::hypot; return hypot(x.real(), x.imag()); } //! 
Power of a complex number -template -KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, - const RealType& e) { - RealType r = abs(x); -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan; - using cl::sycl::cos; - using cl::sycl::pow; - using cl::sycl::sin; -#else - using std::atan; - using std::cos; - using std::pow; - using std::sin; -#endif - RealType phi = atan(x.imag() / x.real()); - return pow(r, e) * Kokkos::complex(cos(phi * e), sin(phi * e)); +template +KOKKOS_INLINE_FUNCTION complex pow(const complex& x, const T& y) { + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::pow; + T r = abs(x); + T theta = atan2(x.imag(), x.real()); + return polar(pow(r, y), y * theta); +} + +template +KOKKOS_INLINE_FUNCTION complex pow(const T& x, const complex& y) { + return pow(complex(x), y); +} + +template +KOKKOS_INLINE_FUNCTION complex pow(const complex& x, + const complex& y) { + using Kokkos::Experimental::log; + + return x == T() ? T() : exp(y * log(x)); +} + +namespace Impl { +// NOTE promote would also be useful for math functions +template ::value> +struct promote { + using type = double; +}; +template +struct promote {}; +template <> +struct promote { + using type = long double; +}; +template <> +struct promote { + using type = double; +}; +template <> +struct promote { + using type = float; +}; +template +using promote_t = typename promote::type; +template +struct promote_2 { + using type = decltype(promote_t() + promote_t()); +}; +template +using promote_2_t = typename promote_2::type; +} // namespace Impl + +template ::value>> +KOKKOS_INLINE_FUNCTION complex> pow( + const T& x, const complex& y) { + using type = Impl::promote_2_t; + return pow(type(x), complex(y)); +} + +template ::value>> +KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, + const U& y) { + using type = Impl::promote_2_t; + return pow(complex(x), type(y)); +} + +template +KOKKOS_INLINE_FUNCTION complex> pow( + const complex& x, const complex& y) 
{ + using type = Impl::promote_2_t; + return pow(complex(x), complex(y)); } //! Square root of a complex number. This is intended to match the stdc++ @@ -729,26 +791,21 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex pow(const complex& x, template KOKKOS_INLINE_FUNCTION Kokkos::complex sqrt( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::abs; - using cl::sycl::sqrt; -#else - using std::abs; - using std::sqrt; -#endif + using Kokkos::Experimental::fabs; + using Kokkos::Experimental::sqrt; RealType r = x.real(); RealType i = x.imag(); if (r == RealType()) { - RealType t = sqrt(abs(i) / 2); + RealType t = sqrt(fabs(i) / 2); return Kokkos::complex(t, i < RealType() ? -t : t); } else { - RealType t = sqrt(2 * (abs(x) + abs(r))); + RealType t = sqrt(2 * (abs(x) + fabs(r))); RealType u = t / 2; - return r > RealType() - ? Kokkos::complex(u, i / t) - : Kokkos::complex(abs(i) / t, i < RealType() ? -u : u); + return r > RealType() ? Kokkos::complex(u, i / t) + : Kokkos::complex(fabs(i) / t, + i < RealType() ? -u : u); } } @@ -762,15 +819,9 @@ KOKKOS_INLINE_FUNCTION complex conj( //! Exponential of a complex number. 
template KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::exp; - using cl::sycl::sin; -#else - using std::cos; - using std::exp; - using std::sin; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::exp; + using Kokkos::Experimental::sin; return exp(x.real()) * complex(cos(x.imag()), sin(x.imag())); } @@ -778,14 +829,9 @@ KOKKOS_INLINE_FUNCTION complex exp(const complex& x) { template KOKKOS_INLINE_FUNCTION Kokkos::complex log( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan; - using cl::sycl::log; -#else - using std::atan; - using std::log; -#endif - RealType phi = atan(x.imag() / x.real()); + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; + RealType phi = atan2(x.imag(), x.real()); return Kokkos::complex(log(abs(x)), phi); } @@ -793,17 +839,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex log( template KOKKOS_INLINE_FUNCTION Kokkos::complex sin( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(sin(x.real()) * cosh(x.imag()), cos(x.real()) * sinh(x.imag())); } @@ -812,17 +851,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex sin( template KOKKOS_INLINE_FUNCTION Kokkos::complex cos( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using 
Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(cos(x.real()) * cosh(x.imag()), -sin(x.real()) * sinh(x.imag())); } @@ -838,17 +870,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex tan( template KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(sinh(x.real()) * cos(x.imag()), cosh(x.real()) * sin(x.imag())); } @@ -857,17 +882,10 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex sinh( template KOKKOS_INLINE_FUNCTION Kokkos::complex cosh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::cos; - using cl::sycl::cosh; - using cl::sycl::sin; - using cl::sycl::sinh; -#else - using std::cos; - using std::cosh; - using std::sin; - using std::sinh; -#endif + using Kokkos::Experimental::cos; + using Kokkos::Experimental::cosh; + using Kokkos::Experimental::sin; + using Kokkos::Experimental::sinh; return Kokkos::complex(cosh(x.real()) * cos(x.imag()), sinh(x.real()) * sin(x.imag())); } @@ -898,13 +916,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex acosh( template KOKKOS_INLINE_FUNCTION Kokkos::complex atanh( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan2; - using cl::sycl::log; -#else - using std::atan2; - using std::log; -#endif + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; const RealType i2 = x.imag() * x.imag(); const RealType r = RealType(1.0) - i2 - x.real() * x.real(); @@ -933,12 +946,7 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex asin( template KOKKOS_INLINE_FUNCTION Kokkos::complex acos( const complex& x) { -#ifdef 
KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::acos; - -#else - using std::acos; -#endif + using Kokkos::Experimental::acos; Kokkos::complex t = asin(x); RealType pi_2 = acos(RealType(0.0)); return Kokkos::complex(pi_2 - t.real(), -t.imag()); @@ -948,13 +956,8 @@ KOKKOS_INLINE_FUNCTION Kokkos::complex acos( template KOKKOS_INLINE_FUNCTION Kokkos::complex atan( const complex& x) { -#ifdef KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_SYCL - using cl::sycl::atan2; - using cl::sycl::log; -#else - using std::atan2; - using std::log; -#endif + using Kokkos::Experimental::atan2; + using Kokkos::Experimental::log; const RealType r2 = x.real() * x.real(); const RealType i = RealType(1.0) - r2 - x.imag() * x.imag(); @@ -996,12 +999,13 @@ KOKKOS_INLINE_FUNCTION operator/(const complex& x, const complex& y) noexcept(noexcept(RealType1{} / RealType2{})) { + using Kokkos::Experimental::fabs; // Scale (by the "1-norm" of y) to avoid unwarranted overflow. // If the real part is +/-Inf and the imaginary part is -/+Inf, // this won't change the result. using common_real_type = typename std::common_type::type; - const common_real_type s = std::fabs(real(y)) + std::fabs(imag(y)); + const common_real_type s = fabs(real(y)) + fabs(imag(y)); // If s is 0, then y is zero, so x/y == real(x)/0 + i*imag(x)/0. 
// In that case, the relation x/y == (x/s) / (y/s) doesn't hold, @@ -1046,7 +1050,7 @@ std::istream& operator>>(std::istream& is, complex& x) { } template -struct reduction_identity > { +struct reduction_identity> { using t_red_ident = reduction_identity; KOKKOS_FORCEINLINE_FUNCTION constexpr static Kokkos::complex sum() noexcept { diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 4dac463a66..c3771ab393 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -58,6 +58,7 @@ #include #include #include +#include #include #include #include @@ -86,6 +87,10 @@ struct InitArguments { int skip_device; bool disable_warnings; bool tune_internals; + bool tool_help = false; + std::string tool_lib = {}; + std::string tool_args = {}; + InitArguments(int nt = -1, int nn = -1, int dv = -1, bool dw = false, bool ti = false) : num_threads{nt}, @@ -139,6 +144,10 @@ void pre_initialize(const InitArguments& args); void post_initialize(const InitArguments& args); +void declare_configuration_metadata(const std::string& category, + const std::string& key, + const std::string& value); + } // namespace Impl bool is_initialized() noexcept; diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 7502719c73..fe7eba3f6e 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -50,6 +50,7 @@ // and compiler environment then sets a collection of #define macros. #include +#include #include #include @@ -180,7 +181,6 @@ using DefaultHostExecutionSpace KOKKOS_IMPL_DEFAULT_HOST_EXEC_SPACE_ANNOTATION = // a given memory space. 
namespace Kokkos { - namespace Impl { #if defined(KOKKOS_ACTIVE_EXECUTION_MEMORY_SPACE_CUDA) && \ @@ -196,16 +196,22 @@ using ActiveExecutionMemorySpace = Kokkos::HostSpace; using ActiveExecutionMemorySpace = void; #endif -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 0 }; +template +struct MemorySpaceAccess; + +template ::accessible> +struct verify_space { + KOKKOS_FUNCTION static void check() {} }; -template -struct VerifyExecutionCanAccessMemorySpace { - enum { value = 1 }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void *) {} +template +struct verify_space { + KOKKOS_FUNCTION static void check() { + Kokkos::abort( + "Kokkos::View ERROR: attempt to access inaccessible memory space"); + }; }; // Base class for exec space initializer factories @@ -220,13 +226,13 @@ class LogicalMemorySpace; } // namespace Kokkos -#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ - Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify(DATA_PTR) +#define KOKKOS_RESTRICT_EXECUTION_TO_DATA(DATA_SPACE, DATA_PTR) \ + Kokkos::Impl::verify_space::check(); -#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE) \ - Kokkos::Impl::VerifyExecutionCanAccessMemorySpace< \ - Kokkos::Impl::ActiveExecutionMemorySpace, DATA_SPACE>::verify() +#define KOKKOS_RESTRICT_EXECUTION_TO_(DATA_SPACE) \ + Kokkos::Impl::verify_space::check(); //---------------------------------------------------------------------------- @@ -256,8 +262,7 @@ template struct ViewCopy; -template +template struct FunctorPolicyExecutionSpace; //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index 4a573d82c0..1a10500b19 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -199,7 +199,7 @@ class CrsRowMapFromCounts { public: 
KOKKOS_INLINE_FUNCTION void operator()(index_type i, value_type& update, bool final_pass) const { - if (i < m_in.size()) { + if (i < static_cast(m_in.size())) { update += m_in(i); if (final_pass) m_out(i + 1) = update; } else if (final_pass) { diff --git a/lib/kokkos/core/src/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Kokkos_Cuda.hpp index 81e11f3f12..7a218120bb 100644 --- a/lib/kokkos/core/src/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Kokkos_Cuda.hpp @@ -63,6 +63,7 @@ #include #include #include +#include /*--------------------------------------------------------------------------*/ @@ -198,16 +199,6 @@ class Cuda { Cuda(); - KOKKOS_FUNCTION Cuda(Cuda&& other) noexcept; - - KOKKOS_FUNCTION Cuda(const Cuda& other); - - KOKKOS_FUNCTION Cuda& operator=(Cuda&& other) noexcept; - - KOKKOS_FUNCTION Cuda& operator=(const Cuda& other); - - KOKKOS_FUNCTION ~Cuda() noexcept; - Cuda(cudaStream_t stream); //-------------------------------------------------------------------------- @@ -253,13 +244,12 @@ class Cuda { static const char* name(); inline Impl::CudaInternal* impl_internal_space_instance() const { - return m_space_instance; + return m_space_instance.get(); } uint32_t impl_instance_id() const noexcept { return 0; } private: - Impl::CudaInternal* m_space_instance; - int* m_counter; + Kokkos::Impl::HostSharedPtr m_space_instance; }; namespace Tools { @@ -319,38 +309,8 @@ struct MemorySpaceAccess -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - inline static void verify(void) { CudaSpace::access_error(); } - inline static void verify(const void* p) { CudaSpace::access_error(p); } -}; - } // namespace Impl } // namespace Kokkos -/*--------------------------------------------------------------------------*/ 
-/*--------------------------------------------------------------------------*/ - -#include -#include -#include -#include -#include -#include -#include - -#include -//---------------------------------------------------------------------------- - #endif /* #if defined( KOKKOS_ENABLE_CUDA ) */ #endif /* #ifndef KOKKOS_CUDA_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp index fc1c0e2f8a..e10fae93c7 100644 --- a/lib/kokkos/core/src/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_CudaSpace.hpp @@ -53,8 +53,10 @@ #include #include #include +#include #include +#include #include @@ -119,8 +121,8 @@ class CudaSpace { /*--------------------------------*/ /** \brief Error reporting for HostSpace attempt to access CudaSpace */ - static void access_error(); - static void access_error(const void* const); + KOKKOS_DEPRECATED static void access_error(); + KOKKOS_DEPRECATED static void access_error(const void* const); private: int m_device; ///< Which Cuda device @@ -128,42 +130,6 @@ class CudaSpace { static constexpr const char* m_name = "Cuda"; friend class Kokkos::Impl::SharedAllocationRecord; }; - -namespace Impl { -/// \brief Initialize lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function initializes the locks to zero (unset). -void init_lock_arrays_cuda_space(); - -/// \brief Retrieve the pointer to the lock array for arbitrary size atomics. -/// -/// Arbitrary atomics are implemented using a hash table of locks -/// where the hash value is derived from the address of the -/// object for which an atomic operation is performed. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. 
-int* atomic_lock_array_cuda_space_ptr(bool deallocate = false); - -/// \brief Retrieve the pointer to the scratch array for team and thread private -/// global memory. -/// -/// Team and Thread private scratch allocations in -/// global memory are acquired via locks. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. -int* scratch_lock_array_cuda_space_ptr(bool deallocate = false); - -/// \brief Retrieve the pointer to the scratch array for unique identifiers. -/// -/// Unique identifiers in the range 0-Cuda::concurrency -/// are provided via locks. -/// This function retrieves the lock array pointer. -/// If the array is not yet allocated it will do so. -int* threadid_lock_array_cuda_space_ptr(bool deallocate = false); -} // namespace Impl } // namespace Kokkos /*--------------------------------------------------------------------------*/ @@ -313,6 +279,11 @@ class CudaHostPinnedSpace { namespace Kokkos { namespace Impl { +cudaStream_t cuda_get_deep_copy_stream(); + +const std::unique_ptr& cuda_get_deep_copy_space( + bool initialize = true); + static_assert(Kokkos::Impl::MemorySpaceAccess::assignable, ""); @@ -784,104 +755,21 @@ struct DeepCopy { namespace Kokkos { namespace Impl { -/** Running in CudaSpace attempting to access HostSpace: error */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - KOKKOS_INLINE_FUNCTION static void verify(void) { - Kokkos::abort("Cuda code attempted to access HostSpace memory"); - } - - KOKKOS_INLINE_FUNCTION static void verify(const void*) { - Kokkos::abort("Cuda code attempted to access HostSpace memory"); - } -}; - -/** Running in CudaSpace accessing CudaUVMSpace: ok */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -/** Running in CudaSpace accessing 
CudaHostPinnedSpace: ok */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -/** Running in CudaSpace attempting to access an unknown space: error */ -template -struct VerifyExecutionCanAccessMemorySpace< - typename std::enable_if::value, - Kokkos::CudaSpace>::type, - OtherSpace> { - enum : bool { value = false }; - KOKKOS_INLINE_FUNCTION static void verify(void) { - Kokkos::abort("Cuda code attempted to access unknown Space memory"); - } - - KOKKOS_INLINE_FUNCTION static void verify(const void*) { - Kokkos::abort("Cuda code attempted to access unknown Space memory"); - } -}; - -//---------------------------------------------------------------------------- -/** Running in HostSpace attempting to access CudaSpace */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = false }; - inline static void verify(void) { CudaSpace::access_error(); } - inline static void verify(const void* p) { CudaSpace::access_error(p); } -}; - -/** Running in HostSpace accessing CudaUVMSpace is OK */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - inline static void verify(void) {} - inline static void verify(const void*) {} -}; - -/** Running in HostSpace accessing CudaHostPinnedSpace is OK */ -template <> -struct VerifyExecutionCanAccessMemorySpace { - enum : bool { value = true }; - KOKKOS_INLINE_FUNCTION static void verify(void) {} - KOKKOS_INLINE_FUNCTION static void verify(const void*) {} -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public 
HostInaccessibleSharedAllocationRecordCommon { private: friend class SharedAllocationRecord; + friend class SharedAllocationRecordCommon; + friend class HostInaccessibleSharedAllocationRecordCommon; using RecordBase = SharedAllocationRecord; + using base_t = + HostInaccessibleSharedAllocationRecordCommon; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static ::cudaTextureObject_t attach_texture_object( const unsigned sizeof_alias, void* const alloc_ptr, const size_t alloc_size); @@ -890,39 +778,19 @@ class SharedAllocationRecord static RecordBase s_root_record; #endif - ::cudaTextureObject_t m_tex_obj; + ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const RecordBase::function_type arg_dealloc = &base_t::deallocate); public: - std::string get_label() const; - - static SharedAllocationRecord* allocate(const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - template inline ::cudaTextureObject_t attach_texture_object() { 
static_assert((std::is_same::value || @@ -945,57 +813,35 @@ class SharedAllocationRecord // Texture object is attached to the entire allocation range return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); } - - static void print_records(std::ostream&, const Kokkos::CudaSpace&, - bool detail = false); }; template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: + friend class SharedAllocationRecordCommon; + + using base_t = SharedAllocationRecordCommon; using RecordBase = SharedAllocationRecord; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static RecordBase s_root_record; - ::cudaTextureObject_t m_tex_obj; + ::cudaTextureObject_t m_tex_obj = 0; const Kokkos::CudaUVMSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_tex_obj(0), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaUVMSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, - const RecordBase::function_type arg_dealloc = &deallocate); + const RecordBase::function_type arg_dealloc = &base_t::deallocate); public: - std::string get_label() const; - - static SharedAllocationRecord* allocate(const Kokkos::CudaUVMSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaUVMSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* 
arg_alloc_ptr); - template inline ::cudaTextureObject_t attach_texture_object() { static_assert((std::is_same::value || @@ -1019,57 +865,32 @@ class SharedAllocationRecord // Texture object is attached to the entire allocation range return ptr - reinterpret_cast(RecordBase::m_alloc_ptr); } - - static void print_records(std::ostream&, const Kokkos::CudaUVMSpace&, - bool detail = false); }; template <> class SharedAllocationRecord - : public SharedAllocationRecord { + : public SharedAllocationRecordCommon { private: + friend class SharedAllocationRecordCommon; + using RecordBase = SharedAllocationRecord; + using base_t = SharedAllocationRecordCommon; SharedAllocationRecord(const SharedAllocationRecord&) = delete; SharedAllocationRecord& operator=(const SharedAllocationRecord&) = delete; - static void deallocate(RecordBase*); - static RecordBase s_root_record; const Kokkos::CudaHostPinnedSpace m_space; protected: ~SharedAllocationRecord(); - SharedAllocationRecord() : RecordBase(), m_space() {} + SharedAllocationRecord() = default; SharedAllocationRecord( const Kokkos::CudaHostPinnedSpace& arg_space, const std::string& arg_label, const size_t arg_alloc_size, const RecordBase::function_type arg_dealloc = &deallocate); - - public: - std::string get_label() const; - - static SharedAllocationRecord* allocate( - const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, const size_t arg_alloc_size); - /**\brief Allocate tracked memory in the space */ - static void* allocate_tracked(const Kokkos::CudaHostPinnedSpace& arg_space, - const std::string& arg_label, - const size_t arg_alloc_size); - - /**\brief Reallocate tracked memory in the space */ - static void* reallocate_tracked(void* const arg_alloc_ptr, - const size_t arg_alloc_size); - - /**\brief Deallocate tracked memory in the space */ - static void deallocate_tracked(void* const arg_alloc_ptr); - - static SharedAllocationRecord* get_record(void* arg_alloc_ptr); - - static void 
print_records(std::ostream&, const Kokkos::CudaHostPinnedSpace&, - bool detail = false); }; } // namespace Impl diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index 3afe081701..55aed13670 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -856,11 +856,12 @@ KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct ThreadVectorRange(const TeamMemberType&, const iType& count) = delete; -template -KOKKOS_INLINE_FUNCTION_DELETED - Impl::ThreadVectorRangeBoundariesStruct - ThreadVectorRange(const TeamMemberType&, const iType& arg_begin, - const iType& arg_end) = delete; +template +KOKKOS_INLINE_FUNCTION_DELETED Impl::ThreadVectorRangeBoundariesStruct< + typename std::common_type::type, TeamMemberType> +ThreadVectorRange(const TeamMemberType&, const iType1& arg_begin, + const iType2& arg_end) = delete; namespace Impl { @@ -902,85 +903,6 @@ struct ParallelConstructName { } // namespace Kokkos namespace Kokkos { -namespace Experimental { - -namespace Impl { -template -struct PolicyPropertyAdaptor; - -template class Policy, - class... Properties> -struct PolicyPropertyAdaptor, - Policy> { - using policy_in_t = Policy; - static_assert(is_execution_policy::value, ""); - using policy_out_t = Policy, - typename policy_in_t::traits::occupancy_control>; -}; - -template