diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml index c0c3e3f89a..2691b9e895 100644 --- a/.github/workflows/coverity.yml +++ b/.github/workflows/coverity.yml @@ -67,7 +67,6 @@ jobs: -D PKG_MANIFOLD=on \ -D PKG_MDI=on \ -D PKG_MGPT=on \ - -D PKG_ML-PACE=on \ -D PKG_ML-RANN=on \ -D PKG_MOLFILE=on \ -D PKG_NETCDF=on \ diff --git a/.github/workflows/lammps-gui-flatpak.yml b/.github/workflows/lammps-gui-flatpak.yml new file mode 100644 index 0000000000..d7dc602476 --- /dev/null +++ b/.github/workflows/lammps-gui-flatpak.yml @@ -0,0 +1,53 @@ +# GitHub action to build LAMMPS-GUI as a flatpak bundle +name: "Build LAMMPS-GUI as flatpak bundle" + +on: + push: + branches: + - develop + + workflow_dispatch: + +concurrency: + group: ${{ github.event_name }}-${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: ${{github.event_name == 'pull_request'}} + +jobs: + build: + name: LAMMPS-GUI flatpak build + if: ${{ github.repository == 'lammps/lammps' }} + runs-on: ubuntu-latest + + steps: + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: 2 + + - name: Install extra packages + run: | + sudo apt-get update + sudo apt-get install -y ccache \ + libeigen3-dev \ + libcurl4-openssl-dev \ + mold \ + ninja-build \ + python3-dev \ + flatpak \ + flatpak-builder + + - name: Set up access to flatpak repo + run: flatpak --user remote-add --if-not-exists flathub https://dl.flathub.org/repo/flathub.flatpakrepo + + - name: Build flatpak + run: | + mkdir flatpack-state + sed -i -e 's/branch:.*/branch: develop/' tools/lammps-gui/org.lammps.lammps-gui.yml + flatpak-builder --force-clean --verbose --repo=flatpak-repo \ + --install-deps-from=flathub --state-dir=flatpak-state \ + --user --ccache --default-branch=${{ github.ref_name }} \ + flatpak-build tools/lammps-gui/org.lammps.lammps-gui.yml + flatpak build-bundle --runtime-repo=https://flathub.org/repo/flathub.flatpakrepo \ + --verbose flatpak-repo LAMMPS-Linux-x86_64-GUI.flatpak \ + org.lammps.lammps-gui ${{ github.ref_name }} + flatpak install -y -v --user LAMMPS-Linux-x86_64-GUI.flatpak diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index cf10e8b544..ce7b9f30f9 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -3,6 +3,9 @@ # CMake build system # This file is part of LAMMPS cmake_minimum_required(VERSION 3.16) +if(CMAKE_VERSION VERSION_LESS 3.20) + message(WARNING "LAMMPS is planning to require at least CMake version 3.20 by Summer 2025. Please upgrade!") +endif() ######################################## # set policy to silence warnings about ignoring _ROOT but use it if(POLICY CMP0074) @@ -95,24 +98,21 @@ check_for_autogen_files(${LAMMPS_SOURCE_DIR}) ##################################################################### include(CheckIncludeFileCXX) -# set required compiler flags and compiler/CPU arch specific optimizations +# set required compiler flags, apply checks, and compiler/CPU arch specific optimizations if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") + # Intel classic compilers version 19 are broken and fail to compile the embedded fmtlib + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 20.0) + message(ERROR "Intel classic compiler version ${CMAKE_CXX_COMPILER_VERSION} is too old") + endif() + if(CMAKE_SYSTEM_NAME STREQUAL "Windows") if(CMAKE_CXX_COMPILER_ID STREQUAL "Intel") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /Qrestrict") endif() - if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4) - set(CMAKE_TUNE_DEFAULT "/QxCOMMON-AVX512") - else() - set(CMAKE_TUNE_DEFAULT "/QxHost") - endif() + set(CMAKE_TUNE_DEFAULT "/QxHost") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -restrict") - if(CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.3 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 17.4) - set(CMAKE_TUNE_DEFAULT "-xCOMMON-AVX512") - else() - set(CMAKE_TUNE_DEFAULT "-xHost -fp-model fast=2 -no-prec-div -qoverride-limits -diag-disable=10441 -diag-disable=11074 -diag-disable=11076 -diag-disable=2196") - endif() + set(CMAKE_TUNE_DEFAULT "-xHost -fp-model fast=2 -no-prec-div -qoverride-limits -diag-disable=10441 -diag-disable=11074 -diag-disable=11076 -diag-disable=2196") endif() endif() @@ -144,16 +144,28 @@ if((PKG_KOKKOS) AND (Kokkos_ENABLE_CUDA) AND NOT (CMAKE_CXX_COMPILER_ID STREQUAL set(CMAKE_TUNE_DEFAULT "${CMAKE_TUNE_DEFAULT}" "-Xcudafe --diag_suppress=unrecognized_pragma,--diag_suppress=128") endif() -# we require C++11 without extensions. Kokkos requires at least C++17 (currently) +# we *require* C++11 without extensions but prefer C++17. +# Kokkos requires at least C++17 (currently) if(NOT CMAKE_CXX_STANDARD) - set(CMAKE_CXX_STANDARD 11) + if(cxx_std_17 IN_LIST CMAKE_CXX_COMPILE_FEATURES) + set(CMAKE_CXX_STANDARD 17) + else() + set(CMAKE_CXX_STANDARD 11) + endif() endif() if(CMAKE_CXX_STANDARD LESS 11) message(FATAL_ERROR "C++ standard must be set to at least 11") endif() +if(CMAKE_CXX_STANDARD LESS 17) + message(WARNING "Selecting C++17 standard is preferred over C++${CMAKE_CXX_STANDARD}") +endif() if(PKG_KOKKOS AND (CMAKE_CXX_STANDARD LESS 17)) set(CMAKE_CXX_STANDARD 17) endif() +# turn off C++17 check in lmptype.h +if(LAMMPS_CXX11) + add_compile_definitions(LAMMPS_CXX11) +endif() set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Use compiler extensions") # ugly hacks for MSVC which by default always reports an old C++ standard in the __cplusplus macro @@ -347,6 +359,17 @@ foreach(PKG ${STANDARD_PACKAGES} ${SUFFIX_PACKAGES}) option(PKG_${PKG} "Build ${PKG} Package" OFF) endforeach() +set(DEPRECATED_PACKAGES AWPMD ATC POEMS) +foreach(PKG ${DEPRECATED_PACKAGES}) + if(PKG_${PKG}) + message(WARNING + "The ${PKG} package will be removed from LAMMPS in Summer 2025 due to lack of " + "maintenance and use of code constructs that conflict with modern C++ compilers " + "and standards. Please contact developers@lammps.org if you have any concerns " + "about this step.") + endif() +endforeach() + ###################################################### # packages with special compiler needs or external libs ###################################################### @@ -579,6 +602,16 @@ foreach(PKG_WITH_INCL KSPACE PYTHON ML-IAP VORONOI COLVARS ML-HDNNP MDI MOLFILE endif() endforeach() +# settings for misc packages and styles +if(PKG_MISC) + option(LAMMPS_ASYNC_IMD "Asynchronous IMD processing" OFF) + mark_as_advanced(LAMMPS_ASYNC_IMD) + if(LAMMPS_ASYNC_IMD) + target_compile_definitions(lammps PRIVATE -DLAMMPS_ASYNC_IMD) + message(STATUS "Using IMD in asynchronous mode") + endif() +endif() + # optionally enable building script wrappers using swig option(WITH_SWIG "Build scripting language wrappers with SWIG" OFF) if(WITH_SWIG) @@ -1078,12 +1111,15 @@ if(BUILD_TOOLS) message(STATUS "<<< Building Tools >>>") endif() if(BUILD_LAMMPS_GUI) - message(STATUS "<<< Building LAMMPS GUI >>>") + message(STATUS "<<< Building LAMMPS-GUI >>>") if(LAMMPS_GUI_USE_PLUGIN) message(STATUS "Loading LAMMPS library as plugin at run time") else() message(STATUS "Linking LAMMPS library at compile time") endif() + if(BUILD_WHAM) + message(STATUS "<<< Building WHAM >>>") + endif() endif() if(ENABLE_TESTING) message(STATUS "<<< Building Unit Tests >>>") diff --git a/cmake/Modules/Packages/KOKKOS.cmake b/cmake/Modules/Packages/KOKKOS.cmake index ddd2daefcd..2fa5a449fb 100644 --- a/cmake/Modules/Packages/KOKKOS.cmake +++ b/cmake/Modules/Packages/KOKKOS.cmake @@ -7,26 +7,13 @@ endif() ######################################################################## # consistency checks and Kokkos options/settings required by LAMMPS -if(Kokkos_ENABLE_CUDA) - option(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC "CUDA asynchronous malloc support" OFF) - mark_as_advanced(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC) - if(Kokkos_ENABLE_IMPL_CUDA_MALLOC_ASYNC) - message(STATUS "KOKKOS: CUDA malloc async support enabled") - else() - message(STATUS "KOKKOS: CUDA malloc async support disabled") - endif() -endif() if(Kokkos_ENABLE_HIP) option(Kokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS "Enable multiple kernel instantiations with HIP" ON) mark_as_advanced(Kokkos_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS) option(Kokkos_ENABLE_ROCTHRUST "Use RoCThrust library" ON) mark_as_advanced(Kokkos_ENABLE_ROCTHRUST) - - if(Kokkos_ARCH_AMD_GFX942 OR Kokkos_ARCH_AMD_GFX940) - option(Kokkos_ENABLE_IMPL_HIP_UNIFIED_MEMORY "Enable unified memory with HIP" ON) - mark_as_advanced(Kokkos_ENABLE_IMPL_HIP_UNIFIED_MEMORY) - endif() endif() + # Adding OpenMP compiler flags without the checks done for # BUILD_OMP can result in compile failures. Enforce consistency. if(Kokkos_ENABLE_OPENMP) @@ -70,8 +57,8 @@ if(DOWNLOAD_KOKKOS) list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_CXX_EXTENSIONS=${CMAKE_CXX_EXTENSIONS}") list(APPEND KOKKOS_LIB_BUILD_ARGS "-DCMAKE_TOOLCHAIN_FILE=${CMAKE_TOOLCHAIN_FILE}") include(ExternalProject) - set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/4.4.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") - set(KOKKOS_MD5 "de6ee80d00b6212b02bfb7f1e71a8392" CACHE STRING "MD5 checksum of KOKKOS tarball") + set(KOKKOS_URL "https://github.com/kokkos/kokkos/archive/4.5.01.tar.gz" CACHE STRING "URL for KOKKOS tarball") + set(KOKKOS_MD5 "4d832aa0284169d9e3fbae3165286bc6" CACHE STRING "MD5 checksum of KOKKOS tarball") mark_as_advanced(KOKKOS_URL) mark_as_advanced(KOKKOS_MD5) GetFallbackURL(KOKKOS_URL KOKKOS_FALLBACK) @@ -96,7 +83,7 @@ if(DOWNLOAD_KOKKOS) add_dependencies(LAMMPS::KOKKOSCORE kokkos_build) add_dependencies(LAMMPS::KOKKOSCONTAINERS kokkos_build) elseif(EXTERNAL_KOKKOS) - find_package(Kokkos 4.4.01 REQUIRED CONFIG) + find_package(Kokkos 4.5.01 REQUIRED CONFIG) target_link_libraries(lammps PRIVATE Kokkos::kokkos) else() set(LAMMPS_LIB_KOKKOS_SRC_DIR ${LAMMPS_LIB_SOURCE_DIR}/kokkos) diff --git a/cmake/Modules/Packages/ML-PACE.cmake b/cmake/Modules/Packages/ML-PACE.cmake index 8660898138..b30c61b8e4 100644 --- a/cmake/Modules/Packages/ML-PACE.cmake +++ b/cmake/Modules/Packages/ML-PACE.cmake @@ -1,50 +1,62 @@ # PACE library support for ML-PACE package +find_package(pace QUIET) -# set policy to silence warnings about timestamps of downloaded files. review occasionally if it may be set to NEW -if(POLICY CMP0135) - cmake_policy(SET CMP0135 OLD) -endif() - -set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.11.25.fix.tar.gz" CACHE STRING "URL for PACE evaluator library sources") -set(PACELIB_MD5 "b45de9a633f42ed65422567e3ce56f9f" CACHE STRING "MD5 checksum of PACE evaluator library tarball") -mark_as_advanced(PACELIB_URL) -mark_as_advanced(PACELIB_MD5) -GetFallbackURL(PACELIB_URL PACELIB_FALLBACK) - -# LOCAL_ML-PACE points to top-level dir with local lammps-user-pace repo, -# to make it easier to check local build without going through the public github releases -if(LOCAL_ML-PACE) - set(lib-pace "${LOCAL_ML-PACE}") +if(pace_FOUND) + find_package(pace) + target_link_libraries(lammps PRIVATE pace::pace) else() - # download library sources to build folder - if(EXISTS ${CMAKE_BINARY_DIR}/libpace.tar.gz) - file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) - endif() - if(NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}") - message(STATUS "Downloading ${PACELIB_URL}") - file(DOWNLOAD ${PACELIB_URL} ${CMAKE_BINARY_DIR}/libpace.tar.gz STATUS DL_STATUS SHOW_PROGRESS) - file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) - if((NOT DL_STATUS EQUAL 0) OR (NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}")) - message(WARNING "Download from primary URL ${PACELIB_URL} failed\nTrying fallback URL ${PACELIB_FALLBACK}") - file(DOWNLOAD ${PACELIB_FALLBACK} ${CMAKE_BINARY_DIR}/libpace.tar.gz EXPECTED_HASH MD5=${PACELIB_MD5} SHOW_PROGRESS) + # set policy to silence warnings about timestamps of downloaded files. review occasionally if it may be set to NEW + if(POLICY CMP0135) + cmake_policy(SET CMP0135 OLD) endif() - else() - message(STATUS "Using already downloaded archive ${CMAKE_BINARY_DIR}/libpace.tar.gz") - endif() + + set(PACELIB_URL "https://github.com/ICAMS/lammps-user-pace/archive/refs/tags/v.2023.11.25.fix2.tar.gz" CACHE STRING "URL for PACE evaluator library sources") + set(PACELIB_MD5 "a53bd87cfee8b07d9f44bc17aad69c3f" CACHE STRING "MD5 checksum of PACE evaluator library tarball") + mark_as_advanced(PACELIB_URL) + mark_as_advanced(PACELIB_MD5) + GetFallbackURL(PACELIB_URL PACELIB_FALLBACK) + + # LOCAL_ML-PACE points to top-level dir with local lammps-user-pace repo, + # to make it easier to check local build without going through the public github releases + if(LOCAL_ML-PACE) + set(lib-pace "${LOCAL_ML-PACE}") + else() + # download library sources to build folder + if(EXISTS ${CMAKE_BINARY_DIR}/libpace.tar.gz) + file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) + endif() + if(NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}") + message(STATUS "Downloading ${PACELIB_URL}") + file(DOWNLOAD ${PACELIB_URL} ${CMAKE_BINARY_DIR}/libpace.tar.gz STATUS DL_STATUS SHOW_PROGRESS) + file(MD5 ${CMAKE_BINARY_DIR}/libpace.tar.gz DL_MD5) + if((NOT DL_STATUS EQUAL 0) OR (NOT "${DL_MD5}" STREQUAL "${PACELIB_MD5}")) + message(WARNING "Download from primary URL ${PACELIB_URL} failed\nTrying fallback URL ${PACELIB_FALLBACK}") + file(DOWNLOAD ${PACELIB_FALLBACK} ${CMAKE_BINARY_DIR}/libpace.tar.gz EXPECTED_HASH MD5=${PACELIB_MD5} SHOW_PROGRESS) + endif() + else() + message(STATUS "Using already downloaded archive ${CMAKE_BINARY_DIR}/libpace.tar.gz") + endif() - # uncompress downloaded sources - execute_process( - COMMAND ${CMAKE_COMMAND} -E remove_directory lammps-user-pace* - COMMAND ${CMAKE_COMMAND} -E tar xzf libpace.tar.gz - WORKING_DIRECTORY ${CMAKE_BINARY_DIR} - ) - get_newest_file(${CMAKE_BINARY_DIR}/lammps-user-pace-* lib-pace) -endif() - -add_subdirectory(${lib-pace} build-pace) -set_target_properties(pace PROPERTIES CXX_EXTENSIONS ON OUTPUT_NAME lammps_pace${LAMMPS_MACHINE}) - -if(CMAKE_PROJECT_NAME STREQUAL "lammps") - target_link_libraries(lammps PRIVATE pace) + # uncompress downloaded sources + execute_process( + COMMAND ${CMAKE_COMMAND} -E remove_directory lammps-user-pace* + COMMAND ${CMAKE_COMMAND} -E tar xzf libpace.tar.gz + WORKING_DIRECTORY ${CMAKE_BINARY_DIR} + ) + get_newest_file(${CMAKE_BINARY_DIR}/lammps-user-pace-* lib-pace) + endif() + + # some preinstalled yaml-cpp versions don't provide a namespaced target + find_package(yaml-cpp QUIET) + if(TARGET yaml-cpp AND NOT TARGET yaml-cpp::yaml-cpp) + add_library(yaml-cpp::yaml-cpp ALIAS yaml-cpp) + endif() + + add_subdirectory(${lib-pace} build-pace) + set_target_properties(pace PROPERTIES CXX_EXTENSIONS ON OUTPUT_NAME lammps_pace${LAMMPS_MACHINE}) + + if(CMAKE_PROJECT_NAME STREQUAL "lammps") + target_link_libraries(lammps PRIVATE pace) + endif() endif() diff --git a/cmake/Modules/Packages/VTK.cmake b/cmake/Modules/Packages/VTK.cmake index a0de1e0ff4..48b1cdba11 100644 --- a/cmake/Modules/Packages/VTK.cmake +++ b/cmake/Modules/Packages/VTK.cmake @@ -1,3 +1,5 @@ +# FindVTK requires that C support is enabled when looking for MPI support +enable_language(C) find_package(VTK REQUIRED NO_MODULE) target_compile_definitions(lammps PRIVATE -DLAMMPS_VTK) if (VTK_MAJOR_VERSION VERSION_LESS 9.0) diff --git a/doc/src/Build.rst b/doc/src/Build.rst index 7cf2a01992..7ca8cd428e 100644 --- a/doc/src/Build.rst +++ b/doc/src/Build.rst @@ -1,10 +1,14 @@ Build LAMMPS ============ -LAMMPS is built as a library and an executable from source code using -either traditional makefiles for use with GNU make (which may require -manual editing), or using a build environment generated by CMake (Unix -Makefiles, Ninja, Xcode, Visual Studio, KDevelop, CodeBlocks and more). +LAMMPS is built as a library and an executable from source code using a +build environment generated by CMake (Unix Makefiles, Ninja, Xcode, +Visual Studio, KDevelop, CodeBlocks and more depending on the platform). +Using CMake is the preferred way to build LAMMPS. In addition, LAMMPS +can be compiled using the legacy build system based on traditional +makefiles for use with GNU make (which may require manual editing). +Support for the legacy build system is slowly being phased out and may +not be available for all optional features. As an alternative, you can download a package with pre-built executables or automated build trees, as described in the :doc:`Install ` diff --git a/doc/src/Build_basics.rst b/doc/src/Build_basics.rst index e82832cf57..7014762cdc 100644 --- a/doc/src/Build_basics.rst +++ b/doc/src/Build_basics.rst @@ -160,7 +160,7 @@ with the OpenMP 3.1 semantics used in LAMMPS for maximal compatibility with compiler versions in use. If compilation with OpenMP enabled fails because of your compiler requiring strict OpenMP 4.0 semantics, you can change the behavior by adding ``-D LAMMPS_OMP_COMPAT=4`` to the -``LMP_INC`` variable in your makefile, or add it to the command line +``LMP_INC`` variable in your makefile, or add it to the command-line flags while configuring with CMake. LAMMPS will auto-detect a suitable setting for most GNU, Clang, and Intel compilers. @@ -502,6 +502,8 @@ using CMake or Make. # chain.x, micelle2d.x, msi2lmp, phana, # stl_bin2txt -D BUILD_LAMMPS_GUI=value # yes or no (default). Build LAMMPS-GUI + -D BUILD_WHAM=value # yes (default). Download and build WHAM; + # only available for BUILD_LAMMPS_GUI=yes The generated binaries will also become part of the LAMMPS installation (see below). diff --git a/doc/src/Build_cmake.rst b/doc/src/Build_cmake.rst index 1b2bef936e..56b8e450f3 100644 --- a/doc/src/Build_cmake.rst +++ b/doc/src/Build_cmake.rst @@ -8,7 +8,7 @@ packages. Links to those pages on the :doc:`Build overview ` page. The following text assumes some familiarity with CMake and focuses on -using the command line tool ``cmake`` and what settings are supported +using the command-line tool ``cmake`` and what settings are supported for building LAMMPS. A more detailed tutorial on how to use CMake itself, the text mode or graphical user interface, to change the generated output files for different build tools and development @@ -16,7 +16,7 @@ environments is on a :doc:`separate page `. .. note:: - LAMMPS currently requires that CMake version 3.16 or later is available. + LAMMPS currently requires that CMake version 3.20 or later is available. .. warning:: @@ -32,22 +32,22 @@ environments is on a :doc:`separate page `. Advantages of using CMake ^^^^^^^^^^^^^^^^^^^^^^^^^ -CMake is an alternative to compiling LAMMPS in the traditional way -through :doc:`(manually customized) makefiles `. Using -CMake has multiple advantages that are specifically helpful for -people with limited experience in compiling software or for people -that want to modify or extend LAMMPS. +CMake is the preferred way of compiling LAMMPS in contrast to the legacy +build system based on GNU make and through :doc:`(manually customized) +makefiles `. Using CMake has multiple advantages that are +specifically helpful for people with limited experience in compiling +software or for people that want to modify or extend LAMMPS. - CMake can detect available hardware, tools, features, and libraries and adapt the LAMMPS default build configuration accordingly. - CMake can generate files for different build tools and integrated development environments (IDE). -- CMake supports customization of settings with a command line, text +- CMake supports customization of settings with a command-line, text mode, or graphical user interface. No manual editing of files, - knowledge of file formats or complex command line syntax is required. + knowledge of file formats or complex command-line syntax is required. - All enabled components are compiled in a single build operation. - Automated dependency tracking for all files and configuration options. -- Support for true out-of-source compilation. Multiple configurations +- Support for true out-of-source compilation. Multiple configurations and settings with different choices of LAMMPS packages, settings, or compilers can be configured and built concurrently from the same source tree. @@ -68,7 +68,7 @@ that purpose you can use either the command-line utility ``cmake`` (or graphical utility ``cmake-gui``, or use them interchangeably. The second step is then the compilation and linking of all objects, libraries, and executables using the selected build tool. Here is a -minimal example using the command line version of CMake to build LAMMPS +minimal example using the command-line version of CMake to build LAMMPS with no add-on packages enabled and no customization: .. code-block:: bash @@ -131,7 +131,7 @@ file called ``CMakeLists.txt`` (for LAMMPS it is located in the configuration step. The cache file contains all current CMake settings. To modify settings, enable or disable features, you need to set -*variables* with either the ``-D`` command line flag (``-D +*variables* with either the ``-D`` command-line flag (``-D VARIABLE1_NAME=value``) or change them in the text mode of the graphical user interface. The ``-D`` flag can be used several times in one command. @@ -141,11 +141,11 @@ a different compiler tool chain. Those are loaded with the ``-C`` flag (``-C ../cmake/presets/basic.cmake``). This step would only be needed once, as the settings from the preset files are stored in the ``CMakeCache.txt`` file. It is also possible to customize the build -by adding one or more ``-D`` flags to the CMake command line. +by adding one or more ``-D`` flags to the CMake command. Generating files for alternate build tools (e.g. Ninja) and project files for IDEs like Eclipse, CodeBlocks, or Kate can be selected using the ``-G`` -command line flag. A list of available generator settings for your +command-line flag. A list of available generator settings for your specific CMake version is given when running ``cmake --help``. .. _cmake_multiconfig: diff --git a/doc/src/Build_development.rst b/doc/src/Build_development.rst index 3adec76abb..5c6475c7fa 100644 --- a/doc/src/Build_development.rst +++ b/doc/src/Build_development.rst @@ -263,9 +263,9 @@ will be skipped if prerequisite features are not available in LAMMPS. time. Preference is given to parts of the code base that are easy to test or commonly used. -Tests as shown by the ``ctest`` program are command lines defined in the +Tests as shown by the ``ctest`` program are commands defined in the ``CMakeLists.txt`` files in the ``unittest`` directory tree. A few -tests simply execute LAMMPS with specific command line flags and check +tests simply execute LAMMPS with specific command-line flags and check the output to the screen for expected content. A large number of unit tests are special tests programs using the `GoogleTest framework `_ and linked to the LAMMPS @@ -420,7 +420,7 @@ during MD timestepping and manipulate per-atom properties like positions, velocities, and forces. For those fix styles, testing can be done in a very similar fashion as for force fields and thus there is a test program `test_fix_timestep` that shares a lot of code, properties, -and command line flags with the force field style testers described in +and command-line flags with the force field style testers described in the previous section. This tester will set up a small molecular system run with verlet run @@ -642,10 +642,10 @@ The following target are available for both, GNU make and CMake: .. _gh-cli: -GitHub command line interface +GitHub command-line interface ----------------------------- -GitHub has developed a `command line tool `_ +GitHub has developed a `command-line tool `_ to interact with the GitHub website via a command called ``gh``. This is extremely convenient when working with a Git repository hosted on GitHub (like LAMMPS). It is thus highly recommended to install it diff --git a/doc/src/Build_extras.rst b/doc/src/Build_extras.rst index 8465bea829..dab2267ee8 100644 --- a/doc/src/Build_extras.rst +++ b/doc/src/Build_extras.rst @@ -48,6 +48,7 @@ This is the list of packages that may require additional steps. * :ref:`LEPTON ` * :ref:`MACHDYN ` * :ref:`MDI ` + * :ref:`MISC ` * :ref:`ML-HDNNP ` * :ref:`ML-IAP ` * :ref:`ML-PACE ` @@ -209,7 +210,7 @@ necessary for ``hipcc`` and the linker to work correctly. Using the CHIP-SPV implementation of HIP is supported. It allows one to run HIP code on Intel GPUs via the OpenCL or Level Zero back ends. To use CHIP-SPV, you must set ``-DHIP_USE_DEVICE_SORT=OFF`` in your CMake -command line as CHIP-SPV does not yet support hipCUB. As of Summer 2022, +command-line as CHIP-SPV does not yet support hipCUB. As of Summer 2022, the use of HIP for Intel GPUs is experimental. You should only use this option in preparations to run on Aurora system at Argonne. @@ -232,7 +233,7 @@ option in preparations to run on Aurora system at Argonne. .. code:: bash - # CUDA target (not recommended, use GPU_ARCH=cuda) + # CUDA target (not recommended, use GPU_API=cuda) # !!! DO NOT set CMAKE_CXX_COMPILER !!! export HIP_PLATFORM=nvcc export HIP_PATH=/path/to/HIP/install @@ -421,9 +422,10 @@ minutes to hours) to build. Of course you only need to do that once.) cmake build system. The ``lib/kim/Install.py`` script supports a ``CMAKE`` environment variable if the cmake executable is named other than ``cmake`` on your system. Additional environment variables may be - provided on the command line for use by cmake. For example, to use the - ``cmake3`` executable and tell it to use the gnu version 11 compilers - to build KIM, one could use the following command line. + set with the ``make`` command for use by cmake. For example, to use the + ``cmake3`` executable and tell it to use the GNU version 11 compilers + called ``g++-11``, ``gcc-11`` and ``gfortran-11`` to build KIM, one + could use the following command. .. code-block:: bash @@ -546,16 +548,7 @@ They must be specified in uppercase. - Local machine * - AMDAVX - HOST - - AMD 64-bit x86 CPU (AVX 1) - * - ZEN - - HOST - - AMD Zen class CPU (AVX 2) - * - ZEN2 - - HOST - - AMD Zen2 class CPU (AVX 2) - * - ZEN3 - - HOST - - AMD Zen3 class CPU (AVX 2) + - AMD chip * - ARMV80 - HOST - ARMv8.0 Compatible CPU @@ -571,105 +564,126 @@ They must be specified in uppercase. * - A64FX - HOST - ARMv8.2 with SVE Support + * - ARMV9_GRACE + - HOST + - ARMv9 NVIDIA Grace CPU * - SNB - HOST - - Intel Sandy/Ivy Bridge CPU (AVX 1) + - Intel Sandy/Ivy Bridge CPUs * - HSW - HOST - - Intel Haswell CPU (AVX 2) + - Intel Haswell CPUs * - BDW - HOST - - Intel Broadwell Xeon E-class CPU (AVX 2 + transactional mem) - * - SKL - - HOST - - Intel Skylake Client CPU - * - SKX - - HOST - - Intel Skylake Xeon Server CPU (AVX512) + - Intel Broadwell Xeon E-class CPUs * - ICL - HOST - - Intel Ice Lake Client CPU (AVX512) + - Intel Ice Lake Client CPUs (AVX512) * - ICX - HOST - - Intel Ice Lake Xeon Server CPU (AVX512) - * - SPR + - Intel Ice Lake Xeon Server CPUs (AVX512) + * - SKL - HOST - - Intel Sapphire Rapids Xeon Server CPU (AVX512) + - Intel Skylake Client CPUs + * - SKX + - HOST + - Intel Skylake Xeon Server CPUs (AVX512) * - KNC - HOST - Intel Knights Corner Xeon Phi * - KNL - HOST - Intel Knights Landing Xeon Phi + * - SPR + - HOST + - Intel Sapphire Rapids Xeon Server CPUs (AVX512) * - POWER8 - HOST - - IBM POWER8 CPU + - IBM POWER8 CPUs * - POWER9 - HOST - - IBM POWER9 CPU + - IBM POWER9 CPUs + * - ZEN + - HOST + - AMD Zen architecture + * - ZEN2 + - HOST + - AMD Zen2 architecture + * - ZEN3 + - HOST + - AMD Zen3 architecture * - RISCV_SG2042 - HOST - - SG2042 (RISC-V) CPU + - SG2042 (RISC-V) CPUs + * - RISCV_RVA22V + - HOST + - RVA22V (RISC-V) CPUs * - KEPLER30 - GPU - - NVIDIA Kepler generation CC 3.0 GPU + - NVIDIA Kepler generation CC 3.0 * - KEPLER32 - GPU - - NVIDIA Kepler generation CC 3.2 GPU + - NVIDIA Kepler generation CC 3.2 * - KEPLER35 - GPU - - NVIDIA Kepler generation CC 3.5 GPU + - NVIDIA Kepler generation CC 3.5 * - KEPLER37 - GPU - - NVIDIA Kepler generation CC 3.7 GPU + - NVIDIA Kepler generation CC 3.7 * - MAXWELL50 - GPU - - NVIDIA Maxwell generation CC 5.0 GPU + - NVIDIA Maxwell generation CC 5.0 * - MAXWELL52 - GPU - - NVIDIA Maxwell generation CC 5.2 GPU + - NVIDIA Maxwell generation CC 5.2 * - MAXWELL53 - GPU - - NVIDIA Maxwell generation CC 5.3 GPU + - NVIDIA Maxwell generation CC 5.3 * - PASCAL60 - GPU - - NVIDIA Pascal generation CC 6.0 GPU + - NVIDIA Pascal generation CC 6.0 * - PASCAL61 - GPU - - NVIDIA Pascal generation CC 6.1 GPU + - NVIDIA Pascal generation CC 6.1 * - VOLTA70 - GPU - - NVIDIA Volta generation CC 7.0 GPU + - NVIDIA Volta generation CC 7.0 * - VOLTA72 - GPU - - NVIDIA Volta generation CC 7.2 GPU + - NVIDIA Volta generation CC 7.2 * - TURING75 - GPU - - NVIDIA Turing generation CC 7.5 GPU + - NVIDIA Turing generation CC 7.5 * - AMPERE80 - GPU - - NVIDIA Ampere generation CC 8.0 GPU + - NVIDIA Ampere generation CC 8.0 * - AMPERE86 - GPU - - NVIDIA Ampere generation CC 8.6 GPU + - NVIDIA Ampere generation CC 8.6 * - ADA89 - GPU - - NVIDIA Ada Lovelace generation CC 8.9 GPU + - NVIDIA Ada generation CC 8.9 * - HOPPER90 - GPU - - NVIDIA Hopper generation CC 9.0 GPU + - NVIDIA Hopper generation CC 9.0 * - AMD_GFX906 - GPU - - AMD GPU MI50/MI60 + - AMD GPU MI50/60 * - AMD_GFX908 - GPU - AMD GPU MI100 * - AMD_GFX90A - GPU - AMD GPU MI200 + * - AMD_GFX940 + - GPU + - AMD GPU MI300 * - AMD_GFX942 - GPU - AMD GPU MI300 + * - AMD_GFX942_APU + - GPU + - AMD APU MI300A * - AMD_GFX1030 - GPU - AMD GPU V620/W6800 @@ -678,7 +692,7 @@ They must be specified in uppercase. - AMD GPU RX7900XTX * - AMD_GFX1103 - GPU - - AMD Phoenix APU with Radeon 740M/760M/780M/880M/890M + - AMD APU Phoenix * - INTEL_GEN - GPU - SPIR64-based devices, e.g. Intel GPUs, using JIT @@ -701,7 +715,7 @@ They must be specified in uppercase. - GPU - Intel GPU Ponte Vecchio -This list was last updated for version 4.3.0 of the Kokkos library. +This list was last updated for version 4.5.1 of the Kokkos library. .. tabs:: @@ -2018,7 +2032,7 @@ TBB and MKL. .. _mdi: MDI package ------------------------------ +----------- .. tabs:: @@ -2045,6 +2059,37 @@ MDI package ---------- +.. _misc: + +MISC package +------------ + +The :doc:`fix imd ` style in this package can be run either +synchronously (communication with IMD clients is done in the main +process) or asynchronously (the fix spawns a separate thread that can +communicate with IMD clients concurrently to the LAMMPS execution). + +.. tabs:: + + .. tab:: CMake build + + .. code-block:: bash + + -D LAMMPS_ASYNC_IMD=value # Run IMD server asynchronously + # value = no (default) or yes + + .. tab:: Traditional make + + To enable asynchronous mode the ``-DLAMMPS_ASYNC_IMD`` define + needs to be added to the ``LMP_INC`` variable in the + ``Makefile.machine`` you are using. For example: + + .. code-block:: make + + LMP_INC = -DLAMMPS_ASYNC_IMD -DLAMMPS_MEMALIGN=64 + +---------- + .. _molfile: MOLFILE package @@ -2191,7 +2236,7 @@ verified to work in February 2020 with Quantum Espresso versions 6.3 to from the sources in the *lib* folder (including the essential libqmmm.a) are not included in the static LAMMPS library and (currently) not installed, while their code is included in the - shared LAMMPS library. Thus a typical command line to configure + shared LAMMPS library. Thus a typical command to configure building LAMMPS for QMMM would be: .. code-block:: bash diff --git a/doc/src/Build_make.rst b/doc/src/Build_make.rst index 932050d410..00f2f0b24d 100644 --- a/doc/src/Build_make.rst +++ b/doc/src/Build_make.rst @@ -8,6 +8,10 @@ Building LAMMPS with traditional makefiles requires that you have a for customizing your LAMMPS build with a number of global compilation options and features. +This build system is slowly being phased out and may not support all +optional features and packages in LAMMPS. It is recommended to switch +to the :doc:`CMake based build system `. + Requirements ^^^^^^^^^^^^ diff --git a/doc/src/Build_package.rst b/doc/src/Build_package.rst index 8b2da8ad30..c4c4889806 100644 --- a/doc/src/Build_package.rst +++ b/doc/src/Build_package.rst @@ -49,6 +49,7 @@ packages: * :ref:`LEPTON ` * :ref:`MACHDYN ` * :ref:`MDI ` + * :ref:`MISC ` * :ref:`ML-HDNNP ` * :ref:`ML-IAP ` * :ref:`ML-PACE ` diff --git a/doc/src/Build_windows.rst b/doc/src/Build_windows.rst index d1748825eb..27562db480 100644 --- a/doc/src/Build_windows.rst +++ b/doc/src/Build_windows.rst @@ -100,9 +100,9 @@ procedure. It is possible to use both the integrated CMake support of the Visual Studio IDE or use an external CMake installation (e.g. downloaded from -cmake.org) to create build files and compile LAMMPS from the command line. +cmake.org) to create build files and compile LAMMPS from the command-line. -Compilation via command line and unit tests are checked automatically +Compilation via command-line and unit tests are checked automatically for the LAMMPS development branch through `GitHub Actions `_. @@ -115,7 +115,7 @@ for the LAMMPS development branch through Please note, that for either approach CMake will create a so-called :ref:`"multi-configuration" build environment `, and -the command lines for building and testing LAMMPS must be adjusted +the commands for building and testing LAMMPS must be adjusted accordingly. The LAMMPS cmake folder contains a ``CMakeSettings.json`` file with diff --git a/doc/src/Classes_lammps.rst b/doc/src/Classes_lammps.rst index dd6798d9a1..167a619fae 100644 --- a/doc/src/Classes_lammps.rst +++ b/doc/src/Classes_lammps.rst @@ -4,7 +4,7 @@ LAMMPS Class The LAMMPS class is encapsulating an MD simulation state and thus it is the class that needs to be created when starting a new simulation system state. The LAMMPS executable essentially creates one instance of this -class and passes the command line flags and tells it to process the +class and passes the command-line flags and tells it to process the provided input (a file or ``stdin``). It shuts the class down when control is returned to it and then exits. When using LAMMPS as a library from another code it is required to create an instance of this diff --git a/doc/src/Commands_bond.rst b/doc/src/Commands_bond.rst index 3be40a6158..40532bdef7 100644 --- a/doc/src/Commands_bond.rst +++ b/doc/src/Commands_bond.rst @@ -90,6 +90,7 @@ OPT. * :doc:`lepton (o) ` * :doc:`mesocnt ` * :doc:`mm3 ` + * :doc:`mwlc ` * :doc:`quartic (o) ` * :doc:`spica (ko) ` * :doc:`table (o) ` diff --git a/doc/src/Commands_input.rst b/doc/src/Commands_input.rst index dc0fb72dd9..0f6e8f94a2 100644 --- a/doc/src/Commands_input.rst +++ b/doc/src/Commands_input.rst @@ -69,7 +69,7 @@ WARNING message is printed. The :doc:`Errors ` page gives more information on what errors mean. The documentation for each command lists restrictions on how the command can be used. -You can use the :ref:`-skiprun ` command line flag +You can use the :ref:`-skiprun ` command-line flag to have LAMMPS skip the execution of any ``run``, ``minimize``, or similar commands to check the entire input for correct syntax to avoid crashes on typos or syntax errors in long runs. diff --git a/doc/src/Commands_removed.rst b/doc/src/Commands_removed.rst index ea8b3d4b03..be775f4c19 100644 --- a/doc/src/Commands_removed.rst +++ b/doc/src/Commands_removed.rst @@ -1,6 +1,10 @@ Removed commands and packages ============================= +.. contents:: + +------ + This page lists LAMMPS commands and packages that have been removed from the distribution and provides suggestions for alternatives or replacements. LAMMPS has special dummy styles implemented, that will @@ -8,47 +12,60 @@ stop LAMMPS and print a suitable error message in most cases, when a style/command is used that has been removed or will replace the command with the direct alternative (if available) and print a warning. -restart2data tool ------------------ +LAMMPS shell +------------ -.. versionchanged:: 23Nov2013 +.. versionchanged:: 29Aug2024 -The functionality of the restart2data tool has been folded into the -LAMMPS executable directly instead of having a separate tool. A -combination of the commands :doc:`read_restart ` and -:doc:`write_data ` can be used to the same effect. For -added convenience this conversion can also be triggered by -:doc:`command line flags ` +The LAMMPS shell has been removed from the LAMMPS distribution. Users +are encouraged to use the :ref:`LAMMPS-GUI ` tool instead. -Fix ave/spatial and fix ave/spatial/sphere ------------------------------------------- +i-PI tool +--------- -.. deprecated:: 11Dec2015 +.. versionchanged:: 27Jun2024 -The fixes ave/spatial and ave/spatial/sphere have been removed from LAMMPS -since they were superseded by the more general and extensible "chunk -infrastructure". Here the system is partitioned in one of many possible -ways through the :doc:`compute chunk/atom ` command -and then averaging is done using :doc:`fix ave/chunk `. -Please refer to the :doc:`chunk HOWTO ` section for an overview. +The i-PI tool has been removed from the LAMMPS distribution. Instead, +instructions to install i-PI from PyPI via pip are provided. -Box command ------------ +USER-REAXC package +------------------ -.. deprecated:: 22Dec2022 +.. deprecated:: 7Feb2024 -The *box* command has been removed and the LAMMPS code changed so it won't -be needed. If present, LAMMPS will ignore the command and print a warning. +The USER-REAXC package has been renamed to :ref:`REAXFF `. +In the process also the pair style and related fixes were renamed to use +the "reaxff" string instead of "reax/c". For a while LAMMPS was maintaining +backward compatibility by providing aliases for the styles. These have +been removed, so using "reaxff" is now *required*. -Reset_ids, reset_atom_ids, reset_mol_ids commands -------------------------------------------------- +MPIIO package +------------- -.. deprecated:: 22Dec2022 +.. deprecated:: 21Nov2023 -The *reset_ids*, *reset_atom_ids*, and *reset_mol_ids* commands have -been folded into the :doc:`reset_atoms ` command. If -present, LAMMPS will replace the commands accordingly and print a -warning. +The MPIIO package has been removed from LAMMPS since it was unmaintained +for many years and thus not updated to incorporate required changes that +had been applied to the corresponding non-MPIIO commands. As a +consequence the MPIIO commands had become unreliable and sometimes +crashing LAMMPS or corrupting data. Similar functionality is available +through the :ref:`ADIOS package ` and the :ref:`NETCDF +package `. Also, the :doc:`dump_modify nfile or dump_modify +fileper ` keywords may be used for an efficient way of +writing out dump files when running on large numbers of processors. +Similarly, the "nfile" and "fileper" keywords exist for restarts: +see :doc:`restart `, :doc:`read_restart `, +:doc:`write_restart `. + +MSCG package +------------ + +.. deprecated:: 21Nov2023 + +The MSCG package has been removed from LAMMPS since it was unmaintained +for many years and instead superseded by the `OpenMSCG software +`_ of the Voth group at the +University of Chicago, which can be used independent from LAMMPS. LATTE package ------------- @@ -64,18 +81,6 @@ packages, including LATTE. See the ``examples/QUANTUM`` dir and the with LATTE as a plugin library (similar to the way fix latte worked), as well as on a different set of MPI processors. -MEAM package ------------- - -The MEAM package in Fortran has been replaced by a C++ implementation. -The code in the :ref:`MEAM package ` is a translation of the -Fortran code of MEAM into C++, which removes several restrictions -(e.g. there can be multiple instances in hybrid pair styles) and allows -for some optimizations leading to better performance. The pair style -:doc:`meam ` has the exact same syntax. For a transition -period the C++ version of MEAM was called USER-MEAMC so it could -coexist with the Fortran version. - Minimize style fire/old ----------------------- @@ -97,38 +102,38 @@ The same functionality is available through :doc:`bond style mesocnt ` and :doc:`angle style mesocnt `. -MPIIO package -------------- +Box command +----------- -.. deprecated:: 21Nov2023 +.. deprecated:: 22Dec2022 -The MPIIO package has been removed from LAMMPS since it was unmaintained -for many years and thus not updated to incorporate required changes that -had been applied to the corresponding non-MPIIO commands. As a -consequence the MPIIO commands had become unreliable and sometimes -crashing LAMMPS or corrupting data. Similar functionality is available -through the :ref:`ADIOS package ` and the :ref:`NETCDF -package `. Also, the :doc:`dump_modify nfile or dump_modify -fileper ` keywords may be used for an efficient way of -writing out dump files when running on large numbers of processors. -Similarly, the "nfile" and "fileper" keywords exist for restarts: -see :doc:`restart `, :doc:`read_restart `, -:doc:`write_restart `. +The *box* command has been removed and the LAMMPS code changed so it won't +be needed. If present, LAMMPS will ignore the command and print a warning. +Reset_ids, reset_atom_ids, reset_mol_ids commands +------------------------------------------------- -MSCG package ------------- +.. deprecated:: 22Dec2022 -.. deprecated:: 21Nov2023 +The *reset_ids*, *reset_atom_ids*, and *reset_mol_ids* commands have +been folded into the :doc:`reset_atoms ` command. If +present, LAMMPS will replace the commands accordingly and print a +warning. -The MSCG package has been removed from LAMMPS since it was unmaintained -for many years and instead superseded by the `OpenMSCG software -`_ of the Voth group at the -University of Chicago, which can be used independent from LAMMPS. +MESSAGE package +--------------- + +.. deprecated:: 4May2022 + +The MESSAGE package has been removed since it was superseded by the +:ref:`MDI package `. MDI implements the same functionality +and in a more general way with direct support for more applications. REAX package ------------ +.. deprecated:: 4Jan2019 + The REAX package has been removed since it was superseded by the :ref:`REAXFF package `. The REAXFF package has been tested to yield equivalent results to the REAX package, offers better @@ -138,20 +143,25 @@ syntax compatible with the removed reax pair style, so input files will have to be adapted. The REAXFF package was originally called USER-REAXC. -USER-REAXC package ------------------- +MEAM package +------------ -.. deprecated:: 7Feb2024 +.. deprecated:: 4Jan2019 -The USER-REAXC package has been renamed to :ref:`REAXFF `. -In the process also the pair style and related fixes were renamed to use -the "reaxff" string instead of "reax/c". For a while LAMMPS was maintaining -backward compatibility by providing aliases for the styles. These have -been removed, so using "reaxff" is now *required*. +The MEAM package in Fortran has been replaced by a C++ implementation. +The code in the :ref:`MEAM package ` is a translation of the +Fortran code of MEAM into C++, which removes several restrictions +(e.g. there can be multiple instances in hybrid pair styles) and allows +for some optimizations leading to better performance. The pair style +:doc:`meam ` has the exact same syntax. For a transition +period the C++ version of MEAM was called USER-MEAMC so it could +coexist with the Fortran version. USER-CUDA package ----------------- +.. deprecated:: 31May2016 + The USER-CUDA package had been removed, since it had been unmaintained for a long time and had known bugs and problems. Significant parts of the design were transferred to the @@ -160,19 +170,27 @@ performance characteristics on NVIDIA GPUs. Both, the KOKKOS and the :ref:`GPU package ` are maintained and allow running LAMMPS with GPU acceleration. -i-PI tool ---------- +Fix ave/spatial and fix ave/spatial/sphere +------------------------------------------ -.. versionchanged:: 27Jun2024 +.. deprecated:: 11Dec2015 -The i-PI tool has been removed from the LAMMPS distribution. Instead, -instructions to install i-PI from PyPI via pip are provided. +The fixes ave/spatial and ave/spatial/sphere have been removed from LAMMPS +since they were superseded by the more general and extensible "chunk +infrastructure". Here the system is partitioned in one of many possible +ways through the :doc:`compute chunk/atom ` command +and then averaging is done using :doc:`fix ave/chunk `. +Please refer to the :doc:`chunk HOWTO ` section for an overview. -LAMMPS shell ------------- +restart2data tool +----------------- -.. versionchanged:: 29Aug2024 +.. deprecated:: 23Nov2013 -The LAMMPS shell has been removed from the LAMMPS distribution. Users -are encouraged to use the :ref:`LAMMPS-GUI ` tool instead. +The functionality of the restart2data tool has been folded into the +LAMMPS executable directly instead of having a separate tool. A +combination of the commands :doc:`read_restart ` and +:doc:`write_data ` can be used to the same effect. For +added convenience this conversion can also be triggered by +:doc:`command-line flags ` diff --git a/doc/src/Developer_org.rst b/doc/src/Developer_org.rst index f1804655ae..93632ae03f 100644 --- a/doc/src/Developer_org.rst +++ b/doc/src/Developer_org.rst @@ -94,12 +94,12 @@ represents what is generally referred to as an "instance of LAMMPS". It is a composite holding pointers to instances of other core classes providing the core functionality of the MD engine in LAMMPS and through them abstractions of the required operations. The constructor of the -LAMMPS class will instantiate those instances, process the command line +LAMMPS class will instantiate those instances, process the command-line flags, initialize MPI (if not already done) and set up file pointers for input and output. The destructor will shut everything down and free all associated memory. Thus code for the standalone LAMMPS executable in ``main.cpp`` simply initializes MPI, instantiates a single instance of -LAMMPS while passing it the command line flags and input script. It +LAMMPS while passing it the command-line flags and input script. It deletes the LAMMPS instance after the method reading the input returns and shuts down the MPI environment before it exits the executable. diff --git a/doc/src/Developer_unittest.rst b/doc/src/Developer_unittest.rst index 07ffb0e2d8..bee0c45480 100644 --- a/doc/src/Developer_unittest.rst +++ b/doc/src/Developer_unittest.rst @@ -227,12 +227,12 @@ Tests for the C-style library interface ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ Tests for validating the LAMMPS C-style library interface are in the -``unittest/c-library`` folder. They are implemented either to be used -for utility functions or for LAMMPS commands, but use the functions -implemented in the ``src/library.cpp`` file as much as possible. There -may be some overlap with other tests, but only in as much as is required -to test the C-style library API. The tests are distributed over -multiple test programs which try to match the grouping of the +``unittest/c-library`` folder. They text either utility functions or +LAMMPS commands, but use the functions implemented in +``src/library.cpp`` as much as possible. There may be some overlap with +other tests as far as the LAMMPS functionality is concerned, but the +focus is on testing the C-style library API. The tests are distributed +over multiple test programs which try to match the grouping of the functions in the source code and :ref:`in the manual `. This group of tests also includes tests invoking LAMMPS in parallel @@ -258,7 +258,7 @@ Tests for the Python module and package The ``unittest/python`` folder contains primarily tests for classes and functions in the LAMMPS python module but also for commands in the -PYTHON package. These tests are only enabled if the necessary +PYTHON package. These tests are only enabled, if the necessary prerequisites are detected or enabled during configuration and compilation of LAMMPS (shared library build enabled, Python interpreter found, Python development files found). @@ -272,29 +272,30 @@ Tests for the Fortran interface Tests for using the Fortran module are in the ``unittest/fortran`` folder. Since they are also using the GoogleTest library, they require -implementing test wrappers in C++ that will call fortran functions -which provide a C function interface through ISO_C_BINDINGS that will in -turn call the functions in the LAMMPS Fortran module. +test wrappers written in C++ that will call fortran functions with a C +function interface through ISO_C_BINDINGS which will in turn call the +functions in the LAMMPS Fortran module. Tests for the C++-style library interface ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The tests in the ``unittest/cplusplus`` folder are somewhat similar to the tests for the C-style library interface, but do not need to test the -several convenience and utility functions that are only available through -the C-style interface. Instead it can focus on the more generic features -that are used internally. This part of the unit tests is currently still -mostly in the planning stage. +convenience and utility functions that are only available through the +C-style library interface. Instead they focus on the more generic +features that are used in LAMMPS internally. This part of the unit +tests is currently still mostly in the planning stage. Tests for reading and writing file formats ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ The ``unittest/formats`` folder contains test programs for reading and -writing files like data files, restart files, potential files or dump files. -This covers simple things like the file i/o convenience functions in the -``utils::`` namespace to complex tests of atom styles where creating and -deleting atoms with different properties is tested in different ways -and through script commands or reading and writing of data or restart files. +writing files like data files, restart files, potential files or dump +files. This covers simple things like the file i/o convenience +functions in the ``utils::`` namespace to complex tests of atom styles +where creating and deleting of atoms with different properties is tested +in different ways and through script commands or reading and writing of +data or restart files. Tests for styles computing or modifying forces ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -443,7 +444,7 @@ file for a style that is similar to one to be tested. The file name should follow the naming conventions described above and after copying the file, the first step is to replace the style names where needed. The coefficient values do not have to be meaningful, just in a reasonable range for the -given system. It does not matter if some forces are large, as long as +given system. It does not matter if some forces are large, for as long as they do not diverge. The template input files define a large number of index variables at the top @@ -476,7 +477,7 @@ the tabulated coulomb, to test both code paths. The reference results in the YA files then should be compared manually, if they agree well enough within the limits of those two approximations. -The ``test_pair_style`` and equivalent programs have special command line options +The ``test_pair_style`` and equivalent programs have special command-line options to update the YAML files. Running a command like .. code-block:: bash @@ -531,19 +532,20 @@ Python module. Troubleshooting failed unit tests ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -The are by default no unit tests for newly added features (e.g. pair, fix, -or compute styles) unless your pull request also includes tests for the -added features. If you are modifying some features, you may see failures -for existing tests, if your modifications have some unexpected side effects -or your changes render the existing test invalid. If you are adding an -accelerated version of an existing style, then only tests for INTEL, -KOKKOS (with OpenMP only), OPENMP, and OPT will be run automatically. -Tests for the GPU package are time consuming and thus are only run -*after* a merge, or when a special label, ``gpu_unit_tests`` is added -to the pull request. After the test has started, it is often best to -remove the label since every PR activity will re-trigger the test (that -is a limitation of triggering a test with a label). Support for unit -tests when using KOKKOS with GPU acceleration is currently not supported. +There are by default no unit tests for newly added features (e.g. pair, +fix, or compute styles) unless your pull request also includes tests for +these added features. If you are modifying some existing LAMMPS +features, you may see failures for existing tests, if your modifications +have some unexpected side effects or your changes render the existing +test invalid. If you are adding an accelerated version of an existing +style, then only tests for INTEL, KOKKOS (with OpenMP only), OPENMP, and +OPT will be run automatically. Tests for the GPU package are time +consuming and thus are only run *after* a merge, or when a special +label, ``gpu_unit_tests`` is added to the pull request. After the test +has started, it is often best to remove the label since every PR +activity will re-trigger the test (that is a limitation of triggering a +test with a label). Support for unit tests using KOKKOS with GPU +acceleration is currently not supported. When you see a failed build on GitHub, click on ``Details`` to be taken to the corresponding LAMMPS Jenkins CI web page. Click on the "Exit" @@ -588,7 +590,7 @@ While the epsilon (relative precision) for a single, `IEEE 754 compliant `_, double precision floating point operation is at about 2.2e-16, the achievable precision for the tests is lower due to most numbers being sums over intermediate results -and the non-associativity of floating point math leading to larger +for which the non-associativity of floating point math leads to larger errors. As a rule of thumb, the test epsilon can often be in the range 5.0e-14 to 1.0e-13. But for "noisy" force kernels, e.g. those a larger amount of arithmetic operations involving `exp()`, `log()` or `sin()` @@ -602,14 +604,14 @@ of floating point operations or that some or most intermediate operations may be done using approximations or with single precision floating point math. -To rerun the failed unit test individually, change to the ``build`` directory +To rerun a failed unit test individually, change to the ``build`` directory and run the test with verbose output. For example, .. code-block:: bash env TEST_ARGS=-v ctest -R ^MolPairStyle:lj_cut_coul_long -V -``ctest`` with the ``-V`` flag also shows the exact command line +``ctest`` with the ``-V`` flag also shows the exact command of the test. One can then use ``gdb --args`` to further debug and catch exceptions with the test command, for example, diff --git a/doc/src/Developer_write_pair.rst b/doc/src/Developer_write_pair.rst index 5d5e081042..90b716d9c4 100644 --- a/doc/src/Developer_write_pair.rst +++ b/doc/src/Developer_write_pair.rst @@ -310,7 +310,7 @@ the constructor and the destructor. Pair styles are different from most classes in LAMMPS that define a "style", as their constructor only uses the LAMMPS class instance -pointer as an argument, but **not** the command line arguments of the +pointer as an argument, but **not** the arguments of the :doc:`pair_style command `. Instead, those arguments are processed in the ``Pair::settings()`` function (or rather the version in the derived class). The constructor is the place where global defaults @@ -891,7 +891,7 @@ originally created from mixing or not). These data file output functions are only useful for true pair-wise additive potentials, where the potential parameters can be entered through *multiple* :doc:`pair_coeff commands `. Pair styles -that require a single "pair_coeff \* \*" command line are not compatible +that require a single "pair_coeff \* \*" command are not compatible with reading their parameters from data files. For pair styles like *born/gauss* that do support writing to data files, the potential parameters will be read from the data file, if present, and @@ -1122,7 +1122,7 @@ once. Thus, the ``coeff()`` function has to do three tasks, each of which is delegated to a function in the ``PairTersoff`` class: #. map elements to atom types. Those follow the potential file name in the - command line arguments and are processed by the ``map_element2type()`` function. + command arguments and are processed by the ``map_element2type()`` function. #. read and parse the potential parameter file in the ``read_file()`` function. #. Build data structures where the original and derived parameters are indexed by all possible triples of atom types and thus can be looked @@ -1356,8 +1356,8 @@ either 0 or 1. The ``morseflag`` variable defaults to 0 and is set to 1 in the ``PairAIREBOMorse::settings()`` function which is called by the -:doc:`pair_style ` command. This function delegates -all command line processing and setting of other parameters to the +:doc:`pair_style ` command. This function delegates all +command argument processing and setting of other parameters to the ``PairAIREBO::settings()`` function of the base class. .. code-block:: c++ diff --git a/doc/src/Errors_debug.rst b/doc/src/Errors_debug.rst index b3f618dbf7..61fb1f7525 100644 --- a/doc/src/Errors_debug.rst +++ b/doc/src/Errors_debug.rst @@ -83,7 +83,7 @@ Run LAMMPS from within the debugger Running LAMMPS under the control of the debugger as shown below only works for a single MPI rank (for debugging a program running in parallel you usually need a parallel debugger program). A simple way to launch -GDB is to prefix the LAMMPS command line with ``gdb --args`` and then +GDB is to prefix the LAMMPS command-line with ``gdb --args`` and then type the command "run" at the GDB prompt. This will launch the debugger, load the LAMMPS executable and its debug info, and then run it. When it reaches the code causing the segmentation fault, it will @@ -180,7 +180,7 @@ inspect the behavior of a compiled program by essentially emulating a CPU and instrumenting the program while running. This slows down execution quite significantly, but can also report issues that are not resulting in a crash. The default valgrind tool is a memory checker and -you can use it by prefixing the normal command line with ``valgrind``. +you can use it by prefixing the normal command-line with ``valgrind``. Unlike GDB, this will also work for parallel execution, but it is recommended to redirect the valgrind output to a file (e.g. with ``--log-file=crash-%p.txt``, the %p will be substituted with the @@ -235,3 +235,53 @@ from GDB. In addition you get a more specific hint about what cause the segmentation fault, i.e. that it is a NULL pointer dereference. To find out which pointer exactly was NULL, you need to use the debugger, though. +Debugging when LAMMPS appears to be stuck +========================================= + +Sometimes the LAMMPS calculation appears to be stuck, that is the LAMMPS +process or processes are active, but there is no visible progress. This +can have multiple reasons: + +- The selected styles are slow and require a lot of CPU time and the + system is large. When extrapolating the expected speed from smaller + systems, one has to factor in that not all models scale linearly with + system size, e.g. :doc:`kspace styles like ewald or pppm + `. There is very little that can be done in this case. +- The output interval is not set or set to a large value with the + :doc:`thermo ` command. I the first case, there will be output + only at the first and last step. +- The output is block-buffered and instead of line-buffered. The output + will only be written to the screen after 4096 or 8192 characters of + output have accumulated. This most often happens for files but also + with MPI parallel executables for output to the screen, since the + output to the screen is handled by the MPI library so that output from + all processes can be shown. This can be suppressed by using the + ``-nonblock`` or ``-nb`` command-line flag, which turns off buffering + for screen and logfile output. +- An MPI parallel calculation has a bug where a collective MPI function + is called (e.g. ``MPI_Barrier()``, ``MPI_Bcast()``, + ``MPI_Allreduce()`` and so on) before pending point-to-point + communications are completed or when the collective function is only + called from a subset of the MPI processes. This also applies to some + internal LAMMPS functions like ``Error::all()`` which uses + ``MPI_Barrier()`` and thus ``Error::one()`` must be called, if the + error condition does not happen on all MPI processes simultaneously. +- Some function in LAMMPS has a bug where a ``for`` or ``while`` loop + does not trigger the exit condition and thus will loop forever. This + can happen when the wrong variable is incremented or when one value in + a comparison becomes ``NaN`` due to an overflow. + +In the latter two cases, further information and stack traces (see above) +can be obtain by attaching a debugger to a running process. For that the +process ID (PID) is needed; this can be found on Linux machines with the +``top``, ``htop``, ``ps``, or ``pstree`` commands. + +Then running the (GNU) debugger ``gdb`` with the ``-p`` flag followed by +the process id will attach the process to the debugger and stop +execution of that specific process. From there on it is possible to +issue all debugger commands in the same way as when LAMMPS was started +from the debugger (see above). Most importantly it is possible to +obtain a stack trace with the ``where`` command and thus determine where +in the execution of a timestep this process is. Also internal data can +be printed and execution single stepped or continued. When the debugger +is exited, the calculation will resume normally. diff --git a/doc/src/Errors_messages.rst b/doc/src/Errors_messages.rst index 2468b3c628..bfc395067a 100644 --- a/doc/src/Errors_messages.rst +++ b/doc/src/Errors_messages.rst @@ -7774,7 +7774,7 @@ Doc page with :doc:`WARNING messages ` *Too few values in body section of molecule file* Self-explanatory. -*Too many -pk arguments in command line* +*Too many -pk arguments in command-line* The string formed by concatenating the arguments is too long. Use a package command in the input script instead. diff --git a/doc/src/Examples.rst b/doc/src/Examples.rst index 3d2103fd6f..683cbd0500 100644 --- a/doc/src/Examples.rst +++ b/doc/src/Examples.rst @@ -146,6 +146,8 @@ Lowercase directories +-------------+------------------------------------------------------------------+ | streitz | use of Streitz/Mintmire potential with charge equilibration | +-------------+------------------------------------------------------------------+ +| stress_vcm | removing binned rigid body motion from binned stress profile | ++-------------+------------------------------------------------------------------+ | tad | temperature-accelerated dynamics of vacancy diffusion in bulk Si | +-------------+------------------------------------------------------------------+ | threebody | regression test input for a variety of manybody potentials | diff --git a/doc/src/Fortran.rst b/doc/src/Fortran.rst index a1ea9a72df..bc641a237f 100644 --- a/doc/src/Fortran.rst +++ b/doc/src/Fortran.rst @@ -16,7 +16,7 @@ compiled alongside the code using it from the source code in ``fortran/lammps.f90`` *and* with the same compiler used to build the rest of the Fortran code that interfaces to LAMMPS. When linking, you also need to :doc:`link to the LAMMPS library `. A typical -command line for a simple program using the Fortran interface would be: +command for a simple program using the Fortran interface would be: .. code-block:: bash @@ -91,12 +91,12 @@ function and triggered with the optional logical argument set to CALL lmp%close(.TRUE.) END PROGRAM testlib -It is also possible to pass command line flags from Fortran to C/C++ and +It is also possible to pass command-line flags from Fortran to C/C++ and thus make the resulting executable behave similarly to the standalone executable (it will ignore the `-in/-i` flag, though). This allows -using the command line to configure accelerator and suffix settings, +using the command-line to configure accelerator and suffix settings, configure screen and logfile output, or to set index style variables -from the command line and more. Here is a correspondingly adapted +from the command-line and more. Here is a correspondingly adapted version of the previous example: .. code-block:: fortran @@ -108,7 +108,7 @@ version of the previous example: CHARACTER(LEN=128), ALLOCATABLE :: command_args(:) INTEGER :: i, argc - ! copy command line flags to `command_args()` + ! copy command-line flags to `command_args()` argc = COMMAND_ARGUMENT_COUNT() ALLOCATE(command_args(0:argc)) DO i=0, argc @@ -321,6 +321,8 @@ of the contents of the :f:mod:`LIBLAMMPS` Fortran interface to LAMMPS. :ftype set_string_variable: subroutine :f set_internal_variable: :f:subr:`set_internal_variable` :ftype set_internal_variable: subroutine + :f eval: :f:func:`eval` + :ftype eval: function :f gather_atoms: :f:subr:`gather_atoms` :ftype gather_atoms: subroutine :f gather_atoms_concat: :f:subr:`gather_atoms_concat` @@ -448,7 +450,7 @@ of the contents of the :f:mod:`LIBLAMMPS` Fortran interface to LAMMPS. compiled with MPI support, it will also initialize MPI, if it has not already been initialized before. - The *args* argument with the list of command line parameters is + The *args* argument with the list of command-line parameters is optional and so it the *comm* argument with the MPI communicator. If *comm* is not provided, ``MPI_COMM_WORLD`` is assumed. For more details please see the documentation of :cpp:func:`lammps_open`. diff --git a/doc/src/Howto.rst b/doc/src/Howto.rst index 5a63e2b1c4..df42f6bd9d 100644 --- a/doc/src/Howto.rst +++ b/doc/src/Howto.rst @@ -103,6 +103,7 @@ Tutorials howto Howto_github Howto_lammps_gui Howto_moltemplate + Howto_python Howto_pylammps Howto_wsl diff --git a/doc/src/Howto_cmake.rst b/doc/src/Howto_cmake.rst index 43aa519293..6b8fc82bad 100644 --- a/doc/src/Howto_cmake.rst +++ b/doc/src/Howto_cmake.rst @@ -56,7 +56,7 @@ using a shell like Bash or Zsh. Visual Studio IDE with the bundled CMake or from the Windows command prompt using a separately installed CMake package, both using the native Microsoft Visual C++ compilers and (optionally) the Microsoft MPI SDK. This tutorial, however, only - covers unix-like command line interfaces. + covers unix-like command-line interfaces. We also assume that you have downloaded and unpacked a recent LAMMPS source code package or used Git to create a clone of the LAMMPS sources on your compilation machine. @@ -277,7 +277,7 @@ Setting options --------------- Options that enable, disable or modify settings are modified by setting -the value of CMake variables. This is done on the command line with the +the value of CMake variables. This is done on the command-line with the *-D* flag in the format ``-D VARIABLE=value``, e.g. ``-D CMAKE_BUILD_TYPE=Release`` or ``-D BUILD_MPI=on``. There is one quirk: when used before the CMake directory, there may be a space between the @@ -376,7 +376,7 @@ Using presets ------------- Since LAMMPS has a lot of optional features and packages, specifying -them all on the command line can be tedious. Or when selecting a +them all on the command-line can be tedious. Or when selecting a different compiler toolchain, multiple options have to be changed consistently and that is rather error prone. Or when enabling certain packages, they require consistent settings to be operated in a @@ -384,7 +384,7 @@ particular mode. For this purpose, we are providing a selection of "preset files" for CMake in the folder ``cmake/presets``. They represent a way to pre-load or override the CMake configuration cache by setting or changing CMake variables. Preset files are loaded using the -*-C* command line flag. You can combine loading multiple preset files or +*-C* command-line flag. You can combine loading multiple preset files or change some variables later with additional *-D* flags. A few examples: .. code-block:: bash diff --git a/doc/src/Howto_github.rst b/doc/src/Howto_github.rst index b81716c09d..78a05f113c 100644 --- a/doc/src/Howto_github.rst +++ b/doc/src/Howto_github.rst @@ -163,7 +163,7 @@ After everything is done, add the files to the branch and commit them: *git rm*, *git mv* for adding, removing, renaming individual files, respectively, and then *git commit* to finalize the commit. Carefully check all pending changes with *git status* before - committing them. If you find doing this on the command line too + committing them. If you find doing this on the command-line too tedious, consider using a GUI, for example the one included in git distributions written in Tk, i.e. use *git gui* (on some Linux distributions it may be required to install an additional package to diff --git a/doc/src/Howto_lammps_gui.rst b/doc/src/Howto_lammps_gui.rst index 21e6a31ccc..63cf859c57 100644 --- a/doc/src/Howto_lammps_gui.rst +++ b/doc/src/Howto_lammps_gui.rst @@ -20,8 +20,11 @@ to the online LAMMPS documentation for known LAMMPS commands and styles. (Ubuntu 20.04LTS or later and compatible), macOS (version 11 aka Big Sur or later), and Windows (version 10 or later) :ref:`are available ` for download. Non-MPI LAMMPS executables (as - ``lmp``) for running LAMMPS from the command line and :doc:`some + ``lmp``) for running LAMMPS from the command-line and :doc:`some LAMMPS tools ` compiled executables are also included. + Also, the pre-compiled LAMMPS-GUI packages include the WHAM executables + from http://membrane.urmc.rochester.edu/content/wham/ for use with + LAMMPS tutorials. The source code for LAMMPS-GUI is included in the LAMMPS source code distribution and can be found in the ``tools/lammps-gui`` folder. It @@ -29,16 +32,16 @@ to the online LAMMPS documentation for known LAMMPS commands and styles. `. LAMMPS-GUI tries to provide an experience similar to what people -traditionally would have running LAMMPS using a command line window and +traditionally would have running LAMMPS using a command-line window and the console LAMMPS executable but just rolled into a single executable: - writing & editing LAMMPS input files with a text editor -- run LAMMPS on those input file with selected command line flags +- run LAMMPS on those input file with selected command-line flags - extract data from the created files and visualize it with and external software That procedure is quite effective for people proficient in using the -command line, as that allows them to use tools for the individual steps +command-line, as that allows them to use tools for the individual steps that they are most comfortable with. In fact, it is often *required* to adopt this workflow when running LAMMPS simulations on high-performance computing facilities. @@ -61,13 +64,18 @@ simple LAMMPS simulations. It is very suitable for tutorials on LAMMPS since you only need to learn how to use a single program for most tasks and thus time can be saved and people can focus on learning LAMMPS. The tutorials at https://lammpstutorials.github.io/ are specifically -updated for use with LAMMPS-GUI. +updated for use with LAMMPS-GUI and can their tutorial materials can +be downloaded and loaded directly from the GUI. Another design goal is to keep the barrier low when replacing part of the functionality of LAMMPS-GUI with external tools. That said, LAMMPS-GUI has some unique functionality that is not found elsewhere: - auto-adapting to features available in the integrated LAMMPS library +- auto-completion for LAMMPS commands and options +- context-sensitive online help +- start and stop of simulations via mouse or keyboard +- monitoring of simulation progress - interactive visualization using the :doc:`dump image ` command with the option to copy-paste the resulting settings - automatic slide show generation from dump image out at runtime @@ -100,10 +108,11 @@ MacOS 11 and later ^^^^^^^^^^^^^^^^^^ After downloading the ``LAMMPS-macOS-multiarch-GUI-.dmg`` -installer package, you need to double-click it and then, in the window -that opens, drag the app bundle as indicated into the "Applications" -folder. The follow the instructions in the "README.txt" file to -get access to the other included executables. +application bundle disk image, you need to double-click it and then, in +the window that opens, drag the app bundle as indicated into the +"Applications" folder. Afterwards, the disk image can be unmounted. +Then follow the instructions in the "README.txt" file to get access to +the other included command-line executables. Linux on x86\_64 ^^^^^^^^^^^^^^^^ @@ -117,15 +126,25 @@ into the "LAMMPS_GUI" folder and execute "./lammps-gui" directly. The second variant uses `flatpak `_ and requires the flatpak management and runtime software to be installed. -After downloading the ``LAMMPS-GUI-Linux-x86_64-GUI-.tar.gz`` +After downloading the ``LAMMPS-GUI-Linux-x86_64-GUI-.flatpak`` flatpak bundle, you can install it with ``flatpak install --user -LAMMPS-GUI-Linux-x86_64-GUI-.tar.gz``. After installation, +LAMMPS-GUI-Linux-x86_64-GUI-.flatpak``. After installation, LAMMPS-GUI should be integrated into your desktop environment under "Applications > Science" but also can be launched from the console with ``flatpak run org.lammps.lammps-gui``. The flatpak bundle also includes the console LAMMPS executable ``lmp`` which can be launched to run -simulations with, for example: ``flatpak run --command=lmp -org.lammps.lammps-gui -in in.melt``. +simulations with, for example with: + +.. code-block:: sh + + flatpak run --command=lmp org.lammps.lammps-gui -in in.melt + +Other bundled command-line executables are run the same way and can be +listed with: + +.. code-block:: sh + + ls $(flatpak info --show-location org.lammps.lammps-gui )/files/bin Compiling from Source @@ -165,9 +184,9 @@ window is stored when exiting and restored when starting again. Opening Files ^^^^^^^^^^^^^ -The LAMMPS-GUI application can be launched without command line arguments +The LAMMPS-GUI application can be launched without command-line arguments and then starts with an empty buffer in the *Editor* window. If arguments -are given LAMMPS will use first command line argument as the file name for +are given LAMMPS will use first command-line argument as the file name for the *Editor* buffer and reads its contents into the buffer, if the file exists. All further arguments are ignored. Files can also be opened via the *File* menu, the `Ctrl-O` (`Command-O` on macOS) keyboard shortcut @@ -261,14 +280,21 @@ Output Window By default, when starting a run, an *Output* window opens that displays the screen output of the running LAMMPS calculation, as shown below. -This text would normally be seen in the command line window. +This text would normally be seen in the command-line window. .. image:: JPG/lammps-gui-log.png :align: center :scale: 50% LAMMPS-GUI captures the screen output from LAMMPS as it is generated and -updates the *Output* window regularly during a run. +updates the *Output* window regularly during a run. If there are any +warnings or errors in the LAMMPS output, they are highlighted by using +bold text colored in red. There is a small panel at the bottom center +of the *Output* window showing how many warnings and errors were +detected and how many lines the entire output has. By clicking on the +button on the right with the warning symbol or by using the keyboard +shortcut `Ctrl-N` (`Command-N` on macOS), you can jump to the next +line with a warning or error. By default, the *Output* window is replaced each time a run is started. The runs are counted and the run number for the current run is displayed @@ -398,7 +424,7 @@ below. Like for the *Output* and *Charts* windows, its content is continuously updated during a run. It will show "(none)" if there are no variables defined. Note that it is also possible to *set* :doc:`index style -variables `, that would normally be set via command line +variables `, that would normally be set via command-line flags, via the "Set Variables..." dialog from the *Run* menu. LAMMPS-GUI automatically defines the variable "gui_run" to the current value of the run counter. That way it is possible to automatically @@ -775,11 +801,11 @@ General Settings: - *Echo input to log:* when checked, all input commands, including variable expansions, are echoed to the *Output* window. This is - equivalent to using `-echo screen` at the command line. There is no + equivalent to using `-echo screen` at the command-line. There is no log *file* produced by default, since LAMMPS-GUI uses `-log none`. - *Include citation details:* when checked full citation info will be included to the log window. This is equivalent to using `-cite - screen` on the command line. + screen` on the command-line. - *Show log window by default:* when checked, the screen output of a LAMMPS run will be collected in a log window during the run - *Show chart window by default:* when checked, the thermodynamic @@ -828,7 +854,7 @@ Accelerators: This tab enables selection of an accelerator package for LAMMPS to use and is equivalent to using the `-suffix` and `-package` flags on the -command line. Only settings supported by the LAMMPS library and local +command-line. Only settings supported by the LAMMPS library and local hardware are available. The `Number of threads` field allows setting the maximum number of threads for the accelerator packages that use threads. diff --git a/doc/src/Howto_peri.rst b/doc/src/Howto_peri.rst index 8a2aa4cab6..29eb685c81 100644 --- a/doc/src/Howto_peri.rst +++ b/doc/src/Howto_peri.rst @@ -738,8 +738,8 @@ command. This can be done, for example, by using the built-in visualizer of the :doc:`dump image or dump movie ` command to create snapshot -images or a movie. Below are example command lines for using dump image -with the :ref:`example listed below ` and a set of images +images or a movie. Below are example command for using dump image with +the :ref:`example listed below ` and a set of images created for steps 300, 600, and 2000 this way. .. code-block:: LAMMPS diff --git a/doc/src/Howto_pylammps.rst b/doc/src/Howto_pylammps.rst index 645434bbab..4a9b985bf6 100644 --- a/doc/src/Howto_pylammps.rst +++ b/doc/src/Howto_pylammps.rst @@ -1,564 +1,6 @@ PyLammps Tutorial ================= -.. contents:: - -Overview --------- - -:py:class:`PyLammps ` is a Python wrapper class for -LAMMPS which can be created on its own or use an existing -:py:class:`lammps Python ` object. It creates a simpler, -more "pythonic" interface to common LAMMPS functionality, in contrast to -the :py:class:`lammps ` wrapper for the LAMMPS :ref:`C -language library interface API ` which is written using -`Python ctypes `_. The :py:class:`lammps ` -wrapper is discussed on the :doc:`Python_head` doc page. - -Unlike the flat `ctypes `_ interface, PyLammps exposes a -discoverable API. It no longer requires knowledge of the underlying C++ -code implementation. Finally, the :py:class:`IPyLammps -` wrapper builds on top of :py:class:`PyLammps -` and adds some additional features for `IPython -integration `_ into `Jupyter notebooks `_, e.g. for -embedded visualization output from :doc:`dump style image `. - -.. _ctypes: https://docs.python.org/3/library/ctypes.html -.. _ipython: https://ipython.org/ -.. _jupyter: https://jupyter.org/ - -Comparison of lammps and PyLammps interfaces -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -lammps.lammps -""""""""""""" - -* uses `ctypes `_ -* direct memory access to native C++ data with optional support for NumPy arrays -* provides functions to send and receive data to LAMMPS -* interface modeled after the LAMMPS :ref:`C language library interface API ` -* requires knowledge of how LAMMPS internally works (C pointers, etc) -* full support for running Python with MPI using `mpi4py `_ -* no overhead from creating a more Python-like interface - -lammps.PyLammps -""""""""""""""" - -* higher-level abstraction built on *top* of the original :py:class:`ctypes based interface ` -* manipulation of Python objects -* communication with LAMMPS is hidden from API user -* shorter, more concise Python -* better IPython integration, designed for quick prototyping -* designed for serial execution -* additional overhead from capturing and parsing the LAMMPS screen output - -Quick Start ------------ - -System-wide Installation -^^^^^^^^^^^^^^^^^^^^^^^^ - -Step 1: Building LAMMPS as a shared library -""""""""""""""""""""""""""""""""""""""""""" - -To use LAMMPS inside of Python it has to be compiled as shared -library. This library is then loaded by the Python interface. In this -example we enable the MOLECULE package and compile LAMMPS with PNG, JPEG -and FFMPEG output support enabled. - -Step 1a: For the CMake based build system, the steps are: - -.. code-block:: bash - - mkdir $LAMMPS_DIR/build-shared - cd $LAMMPS_DIR/build-shared - - # MPI, PNG, Jpeg, FFMPEG are auto-detected - cmake ../cmake -DPKG_MOLECULE=yes -DBUILD_LIB=yes -DBUILD_SHARED_LIBS=yes - make - -Step 1b: For the legacy, make based build system, the steps are: - -.. code-block:: bash - - cd $LAMMPS_DIR/src - - # add packages if necessary - make yes-MOLECULE - - # compile shared library using Makefile - make mpi mode=shlib LMP_INC="-DLAMMPS_PNG -DLAMMPS_JPEG -DLAMMPS_FFMPEG" JPG_LIB="-lpng -ljpeg" - -Step 2: Installing the LAMMPS Python package -"""""""""""""""""""""""""""""""""""""""""""" - -PyLammps is part of the lammps Python package. To install it simply install -that package into your current Python installation with: - -.. code-block:: bash - - make install-python - -.. note:: - - Recompiling the shared library requires re-installing the Python package - -Installation inside of a virtualenv -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -You can use virtualenv to create a custom Python environment specifically tuned -for your workflow. - -Benefits of using a virtualenv -"""""""""""""""""""""""""""""" - -* isolation of your system Python installation from your development installation -* installation can happen in your user directory without root access (useful for HPC clusters) -* installing packages through pip allows you to get newer versions of packages than e.g., through apt-get or yum package managers (and without root access) -* you can even install specific old versions of a package if necessary - -**Prerequisite (e.g. on Ubuntu)** - -.. code-block:: bash - - apt-get install python-virtualenv - -Creating a virtualenv with lammps installed -""""""""""""""""""""""""""""""""""""""""""" - -.. code-block:: bash - - # create virtualenv named 'testing' - virtualenv $HOME/python/testing - - # activate 'testing' environment - source $HOME/python/testing/bin/activate - -Now configure and compile the LAMMPS shared library as outlined above. -When using CMake and the shared library has already been build, you -need to re-run CMake to update the location of the python executable -to the location in the virtual environment with: - -.. code-block:: bash - - cmake . -DPython_EXECUTABLE=$(which python) - - # install LAMMPS package in virtualenv - (testing) make install-python - - # install other useful packages - (testing) pip install matplotlib jupyter mpi4py - - ... - - # return to original shell - (testing) deactivate - -Creating a new instance of PyLammps ------------------------------------ - -To create a PyLammps object you need to first import the class from the lammps -module. By using the default constructor, a new *lammps* instance is created. - -.. code-block:: python - - from lammps import PyLammps - L = PyLammps() - -You can also initialize PyLammps on top of this existing *lammps* object: - -.. code-block:: python - - from lammps import lammps, PyLammps - lmp = lammps() - L = PyLammps(ptr=lmp) - -Commands --------- - -Sending a LAMMPS command with the existing library interfaces is done using -the command method of the lammps object instance. - -For instance, let's take the following LAMMPS command: - -.. code-block:: LAMMPS - - region box block 0 10 0 5 -0.5 0.5 - -In the original interface this command can be executed with the following -Python code if *L* was a lammps instance: - -.. code-block:: python - - L.command("region box block 0 10 0 5 -0.5 0.5") - -With the PyLammps interface, any command can be split up into arbitrary parts -separated by white-space, passed as individual arguments to a region method. - -.. code-block:: python - - L.region("box block", 0, 10, 0, 5, -0.5, 0.5) - -Note that each parameter is set as Python literal floating-point number. In the -PyLammps interface, each command takes an arbitrary parameter list and transparently -merges it to a single command string, separating individual parameters by white-space. - -The benefit of this approach is avoiding redundant command calls and easier -parameterization. In the original interface parameterization needed to be done -manually by creating formatted strings. - -.. code-block:: python - - L.command("region box block %f %f %f %f %f %f" % (xlo, xhi, ylo, yhi, zlo, zhi)) - -In contrast, methods of PyLammps accept parameters directly and will convert -them automatically to a final command string. - -.. code-block:: python - - L.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) - -System state ------------- - -In addition to dispatching commands directly through the PyLammps object, it -also provides several properties which allow you to query the system state. - -L.system - Is a dictionary describing the system such as the bounding box or number of atoms - -L.system.xlo, L.system.xhi - bounding box limits along x-axis - -L.system.ylo, L.system.yhi - bounding box limits along y-axis - -L.system.zlo, L.system.zhi - bounding box limits along z-axis - -L.communication - configuration of communication subsystem, such as the number of threads or processors - -L.communication.nthreads - number of threads used by each LAMMPS process - -L.communication.nprocs - number of MPI processes used by LAMMPS - -L.fixes - List of fixes in the current system - -L.computes - List of active computes in the current system - -L.dump - List of active dumps in the current system - -L.groups - List of groups present in the current system - -Working with LAMMPS variables ------------------------------ - -LAMMPS variables can be both defined and accessed via the PyLammps interface. - -To define a variable you can use the :doc:`variable ` command: - -.. code-block:: python - - L.variable("a index 2") - -A dictionary of all variables is returned by L.variables - -you can access an individual variable by retrieving a variable object from the -L.variables dictionary by name - -.. code-block:: python - - a = L.variables['a'] - -The variable value can then be easily read and written by accessing the value -property of this object. - -.. code-block:: python - - print(a.value) - a.value = 4 - -Retrieving the value of an arbitrary LAMMPS expressions -------------------------------------------------------- - -LAMMPS expressions can be immediately evaluated by using the eval method. The -passed string parameter can be any expression containing global thermo values, -variables, compute or fix data. - -.. code-block:: python - - result = L.eval("ke") # kinetic energy - result = L.eval("pe") # potential energy - - result = L.eval("v_t/2.0") - -Accessing atom data -------------------- - -All atoms in the current simulation can be accessed by using the L.atoms list. -Each element of this list is an object which exposes its properties (id, type, -position, velocity, force, etc.). - -.. code-block:: python - - # access first atom - L.atoms[0].id - L.atoms[0].type - - # access second atom - L.atoms[1].position - L.atoms[1].velocity - L.atoms[1].force - -Some properties can also be used to set: - -.. code-block:: python - - # set position in 2D simulation - L.atoms[0].position = (1.0, 0.0) - - # set position in 3D simulation - L.atoms[0].position = (1.0, 0.0, 1.) - -Evaluating thermo data ----------------------- - -Each simulation run usually produces thermo output based on system state, -computes, fixes or variables. The trajectories of these values can be queried -after a run via the L.runs list. This list contains a growing list of run data. -The first element is the output of the first run, the second element that of -the second run. - -.. code-block:: python - - L.run(1000) - L.runs[0] # data of first 1000 time steps - - L.run(1000) - L.runs[1] # data of second 1000 time steps - -Each run contains a dictionary of all trajectories. Each trajectory is -accessible through its thermo name: - -.. code-block:: python - - L.runs[0].thermo.Step # list of time steps in first run - L.runs[0].thermo.Ke # list of kinetic energy values in first run - -Together with matplotlib plotting data out of LAMMPS becomes simple: - -.. code-block:: python - - import matplotlib.plot as plt - steps = L.runs[0].thermo.Step - ke = L.runs[0].thermo.Ke - plt.plot(steps, ke) - -Error handling with PyLammps ----------------------------- - -Using C++ exceptions in LAMMPS for errors allows capturing them on the -C++ side and rethrowing them on the Python side. This way you can handle -LAMMPS errors through the Python exception handling mechanism. - -.. warning:: - - Capturing a LAMMPS exception in Python can still mean that the - current LAMMPS process is in an illegal state and must be - terminated. It is advised to save your data and terminate the Python - instance as quickly as possible. - -Using PyLammps in IPython notebooks and Jupyter ------------------------------------------------ - -If the LAMMPS Python package is installed for the same Python interpreter as -IPython, you can use PyLammps directly inside of an IPython notebook inside of -Jupyter. Jupyter is a powerful integrated development environment (IDE) for -many dynamic languages like Python, Julia and others, which operates inside of -any web browser. Besides auto-completion and syntax highlighting it allows you -to create formatted documents using Markup, mathematical formulas, graphics and -animations intermixed with executable Python code. It is a great format for -tutorials and showcasing your latest research. - -To launch an instance of Jupyter simply run the following command inside your -Python environment (this assumes you followed the Quick Start instructions): - -.. code-block:: bash - - jupyter notebook - -IPyLammps Examples ------------------- - -Examples of IPython notebooks can be found in the python/examples/pylammps -subdirectory. To open these notebooks launch *jupyter notebook* inside this -directory and navigate to one of them. If you compiled and installed -a LAMMPS shared library with exceptions, PNG, JPEG and FFMPEG support -you should be able to rerun all of these notebooks. - -Validating a dihedral potential -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This example showcases how an IPython Notebook can be used to compare a simple -LAMMPS simulation of a harmonic dihedral potential to its analytical solution. -Four atoms are placed in the simulation and the dihedral potential is applied on -them using a datafile. Then one of the atoms is rotated along the central axis by -setting its position from Python, which changes the dihedral angle. - -.. code-block:: python - - phi = [d \* math.pi / 180 for d in range(360)] - - pos = [(1.0, math.cos(p), math.sin(p)) for p in phi] - - pe = [] - for p in pos: - L.atoms[3].position = p - L.run(0) - pe.append(L.eval("pe")) - -By evaluating the potential energy for each position we can verify that -trajectory with the analytical formula. To compare both solutions, we plot -both trajectories over each other using matplotlib, which embeds the generated -plot inside the IPython notebook. - -.. image:: JPG/pylammps_dihedral.jpg - :align: center - -Running a Monte Carlo relaxation -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -This second example shows how to use PyLammps to create a 2D Monte Carlo Relaxation -simulation, computing and plotting energy terms and even embedding video output. - -Initially, a 2D system is created in a state with minimal energy. - -.. image:: JPG/pylammps_mc_minimum.jpg - :align: center - -It is then disordered by moving each atom by a random delta. - -.. code-block:: python - - random.seed(27848) - deltaperturb = 0.2 - - for i in range(L.system.natoms): - x, y = L.atoms[i].position - dx = deltaperturb \* random.uniform(-1, 1) - dy = deltaperturb \* random.uniform(-1, 1) - L.atoms[i].position = (x+dx, y+dy) - - L.run(0) - -.. image:: JPG/pylammps_mc_disordered.jpg - :align: center - -Finally, the Monte Carlo algorithm is implemented in Python. It continuously -moves random atoms by a random delta and only accepts certain moves. - -.. code-block:: python - - estart = L.eval("pe") - elast = estart - - naccept = 0 - energies = [estart] - - niterations = 3000 - deltamove = 0.1 - kT = 0.05 - - natoms = L.system.natoms - - for i in range(niterations): - iatom = random.randrange(0, natoms) - current_atom = L.atoms[iatom] - - x0, y0 = current_atom.position - - dx = deltamove \* random.uniform(-1, 1) - dy = deltamove \* random.uniform(-1, 1) - - current_atom.position = (x0+dx, y0+dy) - - L.run(1, "pre no post no") - - e = L.eval("pe") - energies.append(e) - - if e <= elast: - naccept += 1 - elast = e - elif random.random() <= math.exp(natoms\*(elast-e)/kT): - naccept += 1 - elast = e - else: - current_atom.position = (x0, y0) - -The energies of each iteration are collected in a Python list and finally plotted using matplotlib. - -.. image:: JPG/pylammps_mc_energies_plot.jpg - :align: center - -The IPython notebook also shows how to use dump commands and embed video files -inside of the IPython notebook. - -Using PyLammps and mpi4py (Experimental) ----------------------------------------- - -PyLammps can be run in parallel using `mpi4py -`_. This python package can be installed -using - -.. code-block:: bash - - pip install mpi4py - -.. warning:: - - Usually, any :py:class:`PyLammps ` command must be - executed by *all* MPI processes. However, evaluations and querying - the system state is only available on MPI rank 0. Using these - functions from other MPI ranks will raise an exception. - -The following is a short example which reads in an existing LAMMPS input -file and executes it in parallel. You can find in.melt in the -examples/melt folder. Please take note that the -:py:meth:`PyLammps.eval() ` is called only from -MPI rank 0. - -.. code-block:: python - - from mpi4py import MPI - from lammps import PyLammps - - L = PyLammps() - L.file("in.melt") - - if MPI.COMM_WORLD.rank == 0: - print("Potential energy: ", L.eval("pe")) - - MPI.Finalize() - -To run this script (melt.py) in parallel using 4 MPI processes we invoke the -following mpirun command: - -.. code-block:: bash - - mpirun -np 4 python melt.py - -Feedback and Contributing -------------------------- - -If you find this Python interface useful, please feel free to provide feedback -and ideas on how to improve it to Richard Berger (richard.berger@outlook.com). We also -want to encourage people to write tutorial style IPython notebooks showcasing LAMMPS usage -and maybe their latest research results. +The PyLammps interface is deprecated and will be removed in a future release of +LAMMPS. As such, the PyLammps version of this tutorial has been removed and is +replaced by the :doc:`Python_head`. diff --git a/doc/src/Howto_python.rst b/doc/src/Howto_python.rst new file mode 100644 index 0000000000..bfb182d989 --- /dev/null +++ b/doc/src/Howto_python.rst @@ -0,0 +1,441 @@ +LAMMPS Python Tutorial +====================== + +.. contents:: + +----- + +Overview +-------- + +The :py:class:`lammps ` Python module is a wrapper class for the +LAMMPS :ref:`C language library interface API ` which is written using +`Python ctypes `_. The design choice of this wrapper class is to +follow the C language API closely with only small changes related to Python +specific requirements and to better accommodate object oriented programming. + +In addition to this flat `ctypes `_ interface, the +:py:class:`lammps ` wrapper class exposes a discoverable +API that doesn't require as much knowledge of the underlying C language +library interface or LAMMPS C++ code implementation. + +Finally, the API exposes some additional features for `IPython integration +`_ into `Jupyter notebooks `_, e.g. for embedded +visualization output from :doc:`dump style image `. + +.. _ctypes: https://docs.python.org/3/library/ctypes.html +.. _ipython: https://ipython.org/ +.. _jupyter: https://jupyter.org/ + +----- + +Quick Start +----------- + +System-wide or User Installation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Step 1: Building LAMMPS as a shared library +""""""""""""""""""""""""""""""""""""""""""" + +To use LAMMPS inside of Python it has to be compiled as shared library. +This library is then loaded by the Python interface. In this example we +enable the :ref:`MOLECULE package ` and compile LAMMPS +with :ref:`PNG, JPEG and FFMPEG output support ` enabled. + +.. tabs:: + + .. tab:: CMake build + + .. code-block:: bash + + mkdir $LAMMPS_DIR/build-shared + cd $LAMMPS_DIR/build-shared + + # MPI, PNG, Jpeg, FFMPEG are auto-detected + cmake ../cmake -DPKG_MOLECULE=yes -DPKG_PYTHON=on -DBUILD_SHARED_LIBS=yes + make + + .. tab:: Traditional make + + .. code-block:: bash + + cd $LAMMPS_DIR/src + + # add packages if necessary + make yes-MOLECULE + make yes-PYTHON + + # compile shared library using Makefile + make mpi mode=shlib LMP_INC="-DLAMMPS_PNG -DLAMMPS_JPEG -DLAMMPS_FFMPEG" JPG_LIB="-lpng -ljpeg" + +Step 2: Installing the LAMMPS Python package +"""""""""""""""""""""""""""""""""""""""""""" + +Next install the LAMMPS Python package into your current Python installation with: + +.. code-block:: bash + + make install-python + +This will create a so-called `"wheel" +`_ +and then install the LAMMPS Python module from that "wheel" into either +into a system folder (provided the command is executed with root +privileges) or into your personal Python module folder. + +.. note:: + + Recompiling the shared library requires re-installing the Python + package. + +Installation inside of a virtual environment +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +You can use virtual environments to create a custom Python environment +specifically tuned for your workflow. + +Benefits of using a virtualenv +"""""""""""""""""""""""""""""" + +* isolation of your system Python installation from your development installation +* installation can happen in your user directory without root access (useful for HPC clusters) +* installing packages through pip allows you to get newer versions of packages than e.g., through apt-get or yum package managers (and without root access) +* you can even install specific old versions of a package if necessary + +**Prerequisite (e.g. on Ubuntu)** + +.. code-block:: bash + + apt-get install python-venv + +Creating a virtualenv with lammps installed +""""""""""""""""""""""""""""""""""""""""""" + +.. code-block:: bash + + # create virtual envrionment named 'testing' + python3 -m venv $HOME/python/testing + + # activate 'testing' environment + source $HOME/python/testing/bin/activate + +Now configure and compile the LAMMPS shared library as outlined above. +When using CMake and the shared library has already been build, you +need to re-run CMake to update the location of the python executable +to the location in the virtual environment with: + +.. code-block:: bash + + cmake . -DPython_EXECUTABLE=$(which python) + + # install LAMMPS package in virtualenv + (testing) make install-python + + # install other useful packages + (testing) pip install matplotlib jupyter mpi4py pandas + + ... + + # return to original shell + (testing) deactivate + +------- + +Creating a new lammps instance +------------------------------ + +To create a lammps object you need to first import the class from the lammps +module. By using the default constructor, a new :py:class:`lammps +` instance is created. + +.. code-block:: python + + from lammps import lammps + L = lammps() + +See the :doc:`LAMMPS Python documentation ` for how to customize +the instance creation with optional arguments. + +----- + +Commands +-------- + +Sending a LAMMPS command with the library interface is done using +the ``command`` method of the lammps object. + +For instance, let's take the following LAMMPS command: + +.. code-block:: LAMMPS + + region box block 0 10 0 5 -0.5 0.5 + +This command can be executed with the following Python code if ``L`` is a ``lammps`` +instance: + +.. code-block:: python + + L.command("region box block 0 10 0 5 -0.5 0.5") + +For convenience, the ``lammps`` class also provides a command wrapper ``cmd`` +that turns any LAMMPS command into a regular function call: + +.. code-block:: python + + L.cmd.region("box block", 0, 10, 0, 5, -0.5, 0.5) + +Note that each parameter is set as Python number literal. With +the wrapper each command takes an arbitrary parameter list and transparently +merges it to a single command string, separating individual parameters by +white-space. + +The benefit of this approach is avoiding redundant command calls and easier +parameterization. With the ``command`` function each call needs to be assembled +manually using formatted strings. + +.. code-block:: python + + L.command(f"region box block {xlo} {xhi} {ylo} {yhi} {zlo} {zhi}") + +The wrapper accepts parameters directly and will convert +them automatically to a final command string. + +.. code-block:: python + + L.cmd.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) + +.. note:: + + When running in IPython you can use Tab-completion after ``L.cmd.`` to see + all available LAMMPS commands. + +----- + +Accessing atom data +------------------- + +All per-atom properties that are part of the :doc:`atom style +` in the current simulation can be accessed using the +:py:meth:`extract_atoms() ` method. This +can be retrieved as ctypes objects or as NumPy arrays through the +lammps.numpy module. Those represent the *local* atoms of the +individual sub-domain for the current MPI process and may contain +information for the local ghost atoms or not depending on the property. +Both can be accessed as lists, but for the ctypes list object the size +is not known and hast to be retrieved first to avoid out-of-bounds +accesses. + +.. code-block:: python + + nlocal = L.extract_setting("nlocal") + nall = L.extract_setting("nall") + print("Number of local atoms ", nlocal, " Number of local and ghost atoms ", nall); + + # access via ctypes directly + atom_id = L.extract_atom("id") + print("Atom IDs", atom_id[0:nlocal]) + + # access through numpy wrapper + atom_type = L.numpy.extract_atom("type") + print("Atom types", atom_type) + + x = L.numpy.extract_atom("x") + v = L.numpy.extract_atom("v") + print("positions array shape", x.shape) + print("velocity array shape", v.shape) + # turn on communicating velocities to ghost atoms + L.cmd.comm_modify("vel", "yes") + v = L.numpy.extract_atom('v') + print("velocity array shape", v.shape) + +Some properties can also be set from Python since internally the +data of the C++ code is accessed directly: + +.. code-block:: python + + # set position in 2D simulation + x[0] = (1.0, 0.0) + + # set position in 3D simulation + x[0] = (1.0, 0.0, 1.) + +------ + +Retrieving the values of thermodynamic data and variables +--------------------------------------------------------- + +To access thermodynamic data from the last completed timestep, +you can use the :py:meth:`get_thermo() ` +method, and to extract the value of (compatible) variables, you +can use the :py:meth:`extract_variable() ` +method. + +.. code-block:: python + + result = L.get_thermo("ke") # kinetic energy + result = L.get_thermo("pe") # potential energy + + result = L.extract_variable("t") / 2.0 + +Error handling +-------------- + +We are using C++ exceptions in LAMMPS for errors and the C language +library interface captures and records them. This allows checking +whether errors have happened in Python during a call into LAMMPS and +then re-throw the error as a Python exception. This way you can handle +LAMMPS errors in the conventional way through the Python exception +handling mechanism. + +.. warning:: + + Capturing a LAMMPS exception in Python can still mean that the + current LAMMPS process is in an illegal state and must be + terminated. It is advised to save your data and terminate the Python + instance as quickly as possible. + +Using LAMMPS in IPython notebooks and Jupyter +--------------------------------------------- + +If the LAMMPS Python package is installed for the same Python +interpreter as IPython, you can use LAMMPS directly inside of an IPython +notebook inside of Jupyter. Jupyter is a powerful integrated development +environment (IDE) for many dynamic languages like Python, Julia and +others, which operates inside of any web browser. Besides +auto-completion and syntax highlighting it allows you to create +formatted documents using Markup, mathematical formulas, graphics and +animations intermixed with executable Python code. It is a great format +for tutorials and showcasing your latest research. + +To launch an instance of Jupyter simply run the following command inside your +Python environment (this assumes you followed the Quick Start instructions): + +.. code-block:: bash + + jupyter notebook + +Interactive Python Examples +--------------------------- + +Examples of IPython notebooks can be found in the ``python/examples/ipython`` +subdirectory. To open these notebooks launch ``jupyter notebook`` inside this +directory and navigate to one of them. If you compiled and installed +a LAMMPS shared library with PNG, JPEG and FFMPEG support +you should be able to rerun all of these notebooks. + +Validating a dihedral potential +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This example showcases how an IPython Notebook can be used to compare a simple +LAMMPS simulation of a harmonic dihedral potential to its analytical solution. +Four atoms are placed in the simulation and the dihedral potential is applied on +them using a datafile. Then one of the atoms is rotated along the central axis by +setting its position from Python, which changes the dihedral angle. + +.. code-block:: python + + phi = [d \* math.pi / 180 for d in range(360)] + + pos = [(1.0, math.cos(p), math.sin(p)) for p in phi] + + x = L.numpy.extract_atom("x") + + pe = [] + for p in pos: + x[3] = p + L.cmd.run(0, "post", "no") + pe.append(L.get_thermo("pe")) + +By evaluating the potential energy for each position we can verify that +trajectory with the analytical formula. To compare both solutions, we plot +both trajectories over each other using matplotlib, which embeds the generated +plot inside the IPython notebook. + +.. image:: JPG/pylammps_dihedral.jpg + :align: center + +Running a Monte Carlo relaxation +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This second example shows how to use the `lammps` Python interface to create a +2D Monte Carlo Relaxation simulation, computing and plotting energy terms and +even embedding video output. + +Initially, a 2D system is created in a state with minimal energy. + +.. image:: JPG/pylammps_mc_minimum.jpg + :align: center + +It is then disordered by moving each atom by a random delta. + +.. code-block:: python + + random.seed(27848) + deltaperturb = 0.2 + x = L.numpy.extract_atom("x") + natoms = x.shape[0] + + for i in range(natoms): + dx = deltaperturb \* random.uniform(-1, 1) + dy = deltaperturb \* random.uniform(-1, 1) + x[i][0] += dx + x[i][1] += dy + + L.cmd.run(0, "post", "no") + +.. image:: JPG/pylammps_mc_disordered.jpg + :align: center + +Finally, the Monte Carlo algorithm is implemented in Python. It continuously +moves random atoms by a random delta and only accepts certain moves. + +.. code-block:: python + + estart = L.get_thermo("pe") + elast = estart + + naccept = 0 + energies = [estart] + + niterations = 3000 + deltamove = 0.1 + kT = 0.05 + + for i in range(niterations): + x = L.numpy.extract_atom("x") + natoms = x.shape[0] + iatom = random.randrange(0, natoms) + current_atom = x[iatom] + + x0 = current_atom[0] + y0 = current_atom[1] + + dx = deltamove \* random.uniform(-1, 1) + dy = deltamove \* random.uniform(-1, 1) + + current_atom[0] = x0 + dx + current_atom[1] = y0 + dy + + L.cmd.run(1, "pre no post no") + + e = L.get_thermo("pe") + energies.append(e) + + if e <= elast: + naccept += 1 + elast = e + elif random.random() <= math.exp(natoms\*(elast-e)/kT): + naccept += 1 + elast = e + else: + current_atom[0] = x0 + current_atom[1] = y0 + +The energies of each iteration are collected in a Python list and finally plotted using matplotlib. + +.. image:: JPG/pylammps_mc_energies_plot.jpg + :align: center + +The IPython notebook also shows how to use dump commands and embed video files +inside of the IPython notebook. diff --git a/doc/src/Howto_wsl.rst b/doc/src/Howto_wsl.rst index fbb75d9cfd..30e4835e4c 100644 --- a/doc/src/Howto_wsl.rst +++ b/doc/src/Howto_wsl.rst @@ -260,7 +260,7 @@ Switch into the :code:`examples/melt` folder: cd ../examples/melt -To run this example in serial, use the following command line: +To run this example in serial, use the following command: .. code-block:: diff --git a/doc/src/Install_git.rst b/doc/src/Install_git.rst index 35b6429f0c..5108009a73 100644 --- a/doc/src/Install_git.rst +++ b/doc/src/Install_git.rst @@ -60,7 +60,7 @@ between them at any time using "git checkout ".) files (mostly by accident). If you do not need access to the entire commit history (most people don't), you can speed up the "cloning" process and reduce local disk space requirements by using the - ``--depth`` git command line flag. That will create a "shallow clone" + ``--depth`` git command-line flag. That will create a "shallow clone" of the repository, which contains only a subset of the git history. Using a depth of 1000 is usually sufficient to include the head commits of the *develop*, the *release*, and the *maintenance* diff --git a/doc/src/JPG/lammps-gui-log.png b/doc/src/JPG/lammps-gui-log.png index 894af371f0..61ab2e0601 100644 Binary files a/doc/src/JPG/lammps-gui-log.png and b/doc/src/JPG/lammps-gui-log.png differ diff --git a/doc/src/Library.rst b/doc/src/Library.rst index 50c28b7fcd..e2021187c2 100644 --- a/doc/src/Library.rst +++ b/doc/src/Library.rst @@ -131,16 +131,15 @@ run LAMMPS in serial mode. .. _lammps_python_api: -LAMMPS Python APIs -================== +LAMMPS Python API +================= The LAMMPS Python module enables calling the LAMMPS C library API from Python by dynamically loading functions in the LAMMPS shared library through the `Python ctypes module `_. Because of the dynamic loading, it is **required** that LAMMPS is compiled in :ref:`"shared" mode `. The Python interface is object-oriented, but -otherwise tries to be very similar to the C library API. Three different -Python classes to run LAMMPS are available and they build on each other. +otherwise tries to be very similar to the C library API. More information on this is in the :doc:`Python_head` section of the manual. Use of the LAMMPS Python module is described in :doc:`Python_module`. diff --git a/doc/src/Library_execute.rst b/doc/src/Library_execute.rst index 44b601ba4c..8560fbb148 100644 --- a/doc/src/Library_execute.rst +++ b/doc/src/Library_execute.rst @@ -7,6 +7,7 @@ This section documents the following functions: - :cpp:func:`lammps_command` - :cpp:func:`lammps_commands_list` - :cpp:func:`lammps_commands_string` +- :cpp:func:`lammps_expand` -------------------- @@ -79,3 +80,8 @@ Below is a short example using some of these functions. .. doxygenfunction:: lammps_commands_string :project: progguide +----------------------- + +.. doxygenfunction:: lammps_expand + :project: progguide + diff --git a/doc/src/Library_objects.rst b/doc/src/Library_objects.rst index 7c0ca824d7..53edfce9e6 100644 --- a/doc/src/Library_objects.rst +++ b/doc/src/Library_objects.rst @@ -1,5 +1,5 @@ -Compute, fixes, variables -========================= +Computes, fixes, variables +========================== This section documents accessing or modifying data stored by computes, fixes, or variables in LAMMPS using the following functions: @@ -12,6 +12,7 @@ fixes, or variables in LAMMPS using the following functions: - :cpp:func:`lammps_set_string_variable` - :cpp:func:`lammps_set_internal_variable` - :cpp:func:`lammps_variable_info` +- :cpp:func:`lammps_eval` ----------------------- @@ -55,6 +56,11 @@ fixes, or variables in LAMMPS using the following functions: ----------------------- +.. doxygenfunction:: lammps_eval + :project: progguide + +----------------------- + .. doxygenenum:: _LMP_DATATYPE_CONST .. doxygenenum:: _LMP_STYLE_CONST diff --git a/doc/src/Modify_requirements.rst b/doc/src/Modify_requirements.rst index cbcb3eca13..c3e514a423 100644 --- a/doc/src/Modify_requirements.rst +++ b/doc/src/Modify_requirements.rst @@ -208,20 +208,21 @@ Build system (strict) LAMMPS currently supports two build systems: one that is based on :doc:`traditional Makefiles ` and one that is based on -:doc:`CMake `. Therefore, your contribution must be -compatible with and support both build systems. +:doc:`CMake `. As of fall 2024, it is no longer required +to support the traditional make build system. New packages may choose +to only support building with CMake. Additions to existing packages +must follow the requirements set by that package. For a single pair of header and implementation files that are an independent feature, it is usually only required to add them to ``src/.gitignore``. For traditional make, if your contributed files or package depend on -other LAMMPS style files or packages also being installed -(e.g. because your file is a derived class from the other LAMMPS -class), then an ``Install.sh`` file is also needed to check for those -dependencies and modifications to ``src/Depend.sh`` to trigger the checks. -See other README and Install.sh files in other directories as -examples. +other LAMMPS style files or packages also being installed (e.g. because +your file is a derived class from the other LAMMPS class), then an +``Install.sh`` file is also needed to check for those dependencies and +modifications to ``src/Depend.sh`` to trigger the checks. See other +README and Install.sh files in other directories as examples. Similarly, for CMake support, changes may need to be made to ``cmake/CMakeLists.txt``, some of the files in ``cmake/presets``, and diff --git a/doc/src/Modify_style.rst b/doc/src/Modify_style.rst index 496415237c..69b7cc9350 100644 --- a/doc/src/Modify_style.rst +++ b/doc/src/Modify_style.rst @@ -46,7 +46,7 @@ Include files (varied) but instead should be initialized either in the initializer list of the constructor or explicitly assigned in the body of the constructor. If the member variable is relevant to the functionality of a class - (for example when it stores a value from a command line argument), the + (for example when it stores a value from a command-line argument), the member variable declaration is followed by a brief comment explaining its purpose and what its values can be. Class members that are pointers should always be initialized to ``nullptr`` in the diff --git a/doc/src/Packages_details.rst b/doc/src/Packages_details.rst index dd6536c15c..baefee0185 100644 --- a/doc/src/Packages_details.rst +++ b/doc/src/Packages_details.rst @@ -2172,8 +2172,8 @@ the :doc:`Build extras ` page. * ``src/OPENMP/README`` * :doc:`Accelerator packages ` * :doc:`OPENMP package ` -* :doc:`Command line option -suffix/-sf omp ` -* :doc:`Command line option -package/-pk omp ` +* :doc:`Command-line option -suffix/-sf omp ` +* :doc:`Command-line option -package/-pk omp ` * :doc:`package omp ` * Search the :doc:`commands ` pages (:doc:`fix `, :doc:`compute `, :doc:`pair `, :doc:`bond, angle, dihedral, improper `, diff --git a/doc/src/Python_atoms.rst b/doc/src/Python_atoms.rst index f01559a524..2d07cc2326 100644 --- a/doc/src/Python_atoms.rst +++ b/doc/src/Python_atoms.rst @@ -2,14 +2,8 @@ Per-atom properties =================== Similar to what is described in :doc:`Library_atoms`, the instances of -:py:class:`lammps `, :py:class:`PyLammps `, or -:py:class:`IPyLammps ` can be used to extract atom quantities -and modify some of them. The main difference between the interfaces is how the information -is exposed. - -While the :py:class:`lammps ` is just a thin layer that wraps C API calls, -:py:class:`PyLammps ` and :py:class:`IPyLammps ` expose -information as objects and properties. +:py:class:`lammps ` can be used to extract atom quantities +and modify some of them. In some cases the data returned is a direct reference to the original data inside LAMMPS cast to ``ctypes`` pointers. Where possible, the wrappers will @@ -25,57 +19,41 @@ against invalid accesses. accordingly. These arrays can change sizes and order at every neighbor list rebuild and atom sort event as atoms are migrating between subdomains. -.. tabs:: +.. code-block:: python - .. tab:: lammps API + from lammps import lammps - .. code-block:: python + lmp = lammps() + lmp.file("in.sysinit") - from lammps import lammps - lmp = lammps() - lmp.file("in.sysinit") + # Read/Write access via ctypes + nlocal = lmp.extract_global("nlocal") + x = lmp.extract_atom("x") - nlocal = lmp.extract_global("nlocal") - x = lmp.extract_atom("x") + for i in range(nlocal): + print("(x,y,z) = (", x[i][0], x[i][1], x[i][2], ")") - for i in range(nlocal): - print("(x,y,z) = (", x[i][0], x[i][1], x[i][2], ")") + # Read/Write access via NumPy arrays + atom_id = L.numpy.extract_atom("id") + atom_type = L.numpy.extract_atom("type") + x = L.numpy.extract_atom("x") + v = L.numpy.extract_atom("v") + f = L.numpy.extract_atom("f") - lmp.close() + # set position in 2D simulation + x[0] = (1.0, 0.0) - **Methods**: + # set position in 3D simulation + x[0] = (1.0, 0.0, 1.) - * :py:meth:`extract_atom() `: extract a per-atom quantity + lmp.close() - **Numpy Methods**: - * :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array +**Methods**: - .. tab:: PyLammps/IPyLammps API +* :py:meth:`extract_atom() `: extract a per-atom quantity - All atoms in the current simulation can be accessed by using the :py:attr:`PyLammps.atoms ` property. - Each element of this list is a :py:class:`Atom ` or :py:class:`Atom2D ` object. The attributes of - these objects provide access to their data (id, type, position, velocity, force, etc.): - - .. code-block:: python - - # access first atom - L.atoms[0].id - L.atoms[0].type - - # access second atom - L.atoms[1].position - L.atoms[1].velocity - L.atoms[1].force - - Some attributes can be changed: - - .. code-block:: python - - # set position in 2D simulation - L.atoms[0].position = (1.0, 0.0) - - # set position in 3D simulation - L.atoms[0].position = (1.0, 0.0, 1.0) +**Numpy Methods**: +* :py:meth:`numpy.extract_atom() `: extract a per-atom quantity as numpy array diff --git a/doc/src/Python_create.rst b/doc/src/Python_create.rst index 939aad2f32..9301829ea9 100644 --- a/doc/src/Python_create.rst +++ b/doc/src/Python_create.rst @@ -6,11 +6,10 @@ Creating or deleting a LAMMPS object ==================================== With the Python interface the creation of a :cpp:class:`LAMMPS -` instance is included in the constructors for the -:py:class:`lammps `, :py:class:`PyLammps `, -and :py:class:`IPyLammps ` classes. -Internally it will call either :cpp:func:`lammps_open` or :cpp:func:`lammps_open_no_mpi` from the C -library API to create the class instance. +` instance is included in the constructor for the +:py:class:`lammps ` class. Internally it will call either +:cpp:func:`lammps_open` or :cpp:func:`lammps_open_no_mpi` from the C library +API to create the class instance. All arguments are optional. The *name* argument allows loading a LAMMPS shared library that is named ``liblammps_machine.so`` instead of @@ -26,108 +25,25 @@ to run the Python module like the library interface on a subset of the MPI ranks after splitting the communicator. -Here are simple examples using all three Python interfaces: +Here is a simple example using the LAMMPS Python interface: -.. tabs:: +.. code-block:: python - .. tab:: lammps API + from lammps import lammps - .. code-block:: python + # NOTE: argv[0] is set by the lammps class constructor + args = ["-log", "none"] - from lammps import lammps + # create LAMMPS instance + lmp = lammps(cmdargs=args) - # NOTE: argv[0] is set by the lammps class constructor - args = ["-log", "none"] + # get and print numerical version code + print("LAMMPS Version: ", lmp.version()) - # create LAMMPS instance - lmp = lammps(cmdargs=args) + # explicitly close and delete LAMMPS instance (optional) + lmp.close() - # get and print numerical version code - print("LAMMPS Version: ", lmp.version()) - - # explicitly close and delete LAMMPS instance (optional) - lmp.close() - - .. tab:: PyLammps API - - The :py:class:`PyLammps ` class is a wrapper around the - :py:class:`lammps ` class and all of its lower level functions. - By default, it will create a new instance of :py:class:`lammps ` passing - along all arguments to the constructor of :py:class:`lammps `. - - .. code-block:: python - - from lammps import PyLammps - - # NOTE: argv[0] is set by the lammps class constructor - args = ["-log", "none"] - - # create LAMMPS instance - L = PyLammps(cmdargs=args) - - # get and print numerical version code - print("LAMMPS Version: ", L.version()) - - # explicitly close and delete LAMMPS instance (optional) - L.close() - - :py:class:`PyLammps ` objects can also be created on top of an existing - :py:class:`lammps ` object: - - .. code-block:: python - - from lammps import lammps, PyLammps - ... - # create LAMMPS instance - lmp = lammps(cmdargs=args) - - # create PyLammps instance using previously created LAMMPS instance - L = PyLammps(ptr=lmp) - - This is useful if you have to create the :py:class:`lammps ` - instance is a specific way, but want to take advantage of the - :py:class:`PyLammps ` interface. - - .. tab:: IPyLammps API - - The :py:class:`IPyLammps ` class is an extension of the - :py:class:`PyLammps ` class. It has the same construction behavior. By - default, it will create a new instance of :py:class:`lammps` passing - along all arguments to the constructor of :py:class:`lammps`. - - .. code-block:: python - - from lammps import IPyLammps - - # NOTE: argv[0] is set by the lammps class constructor - args = ["-log", "none"] - - # create LAMMPS instance - L = IPyLammps(cmdargs=args) - - # get and print numerical version code - print("LAMMPS Version: ", L.version()) - - # explicitly close and delete LAMMPS instance (optional) - L.close() - - You can also initialize IPyLammps on top of an existing :py:class:`lammps` or :py:class:`PyLammps` object: - - .. code-block:: python - - from lammps import lammps, IPyLammps - ... - # create LAMMPS instance - lmp = lammps(cmdargs=args) - - # create PyLammps instance using previously created LAMMPS instance - L = PyLammps(ptr=lmp) - - This is useful if you have to create the :py:class:`lammps ` - instance is a specific way, but want to take advantage of the - :py:class:`IPyLammps ` interface. - -In all of the above cases, same as with the :ref:`C library API `, this will use the +Same as with the :ref:`C library API `, this will use the ``MPI_COMM_WORLD`` communicator for the MPI library that LAMMPS was compiled with. diff --git a/doc/src/Python_execute.rst b/doc/src/Python_execute.rst index 01cf0e920f..a9d65133db 100644 --- a/doc/src/Python_execute.rst +++ b/doc/src/Python_execute.rst @@ -1,127 +1,123 @@ Executing commands ================== -Once an instance of the :py:class:`lammps `, -:py:class:`PyLammps `, or -:py:class:`IPyLammps ` class is created, there are +Once an instance of the :py:class:`lammps ` class is created, there are multiple ways to "feed" it commands. In a way that is not very different from running a LAMMPS input script, except that Python has many more facilities for structured programming than the LAMMPS input script syntax. Furthermore it is possible to "compute" what the next LAMMPS command should be. -.. tabs:: +Same as in the equivalent :doc:`C library functions `, +commands can be read from a file, a single string, a list of strings and a +block of commands in a single multi-line string. They are processed under the +same boundary conditions as the C library counterparts. The example below +demonstrates the use of :py:func:`lammps.file()`, :py:func:`lammps.command()`, +:py:func:`lammps.commands_list()`, and :py:func:`lammps.commands_string()`: - .. tab:: lammps API +.. code-block:: python - Same as in the equivalent - :doc:`C library functions `, commands can be read from a file, a - single string, a list of strings and a block of commands in a single - multi-line string. They are processed under the same boundary conditions - as the C library counterparts. The example below demonstrates the use - of :py:func:`lammps.file()`, :py:func:`lammps.command()`, - :py:func:`lammps.commands_list()`, and :py:func:`lammps.commands_string()`: + from lammps import lammps + lmp = lammps() - .. code-block:: python + # read commands from file 'in.melt' + lmp.file('in.melt') - from lammps import lammps - lmp = lammps() + # issue a single command + lmp.command('variable zpos index 1.0') - # read commands from file 'in.melt' - lmp.file('in.melt') + # create 10 groups with 10 atoms each + cmds = [f"group g{i} id {10*i+1}:{10*(i+1)}" for i in range(10)] + lmp.commands_list(cmds) - # issue a single command - lmp.command('variable zpos index 1.0') + # run commands from a multi-line string + block = """ + clear + region box block 0 2 0 2 0 2 + create_box 1 box + create_atoms 1 single 1.0 1.0 ${zpos} + """ + lmp.commands_string(block) - # create 10 groups with 10 atoms each - cmds = ["group g{} id {}:{}".format(i,10*i+1,10*(i+1)) for i in range(10)] - lmp.commands_list(cmds) +For convenience, the :py:class:`lammps ` class also provides a +command wrapper ``cmd`` that turns any LAMMPS command into a regular function +call. - # run commands from a multi-line string - block = """ - clear - region box block 0 2 0 2 0 2 - create_box 1 box - create_atoms 1 single 1.0 1.0 ${zpos} - """ - lmp.commands_string(block) +For instance, the following LAMMPS command - .. tab:: PyLammps/IPyLammps API +.. code-block:: LAMMPS - Unlike the lammps API, the PyLammps/IPyLammps APIs allow running LAMMPS - commands by calling equivalent member functions of :py:class:`PyLammps ` - and :py:class:`IPyLammps ` instances. + region box block 0 10 0 5 -0.5 0.5 - For instance, the following LAMMPS command +would normally be executed with the following Python code: - .. code-block:: LAMMPS +.. code-block:: python - region box block 0 10 0 5 -0.5 0.5 + from lammps import lammps - can be executed using with the lammps API with the following Python code if ``lmp`` is an - instance of :py:class:`lammps `: + lmp = lammps() + lmp.command("region box block 0 10 0 5 -0.5 0.5") - .. code-block:: python +With the ``cmd`` wrapper, any LAMMPS command can be split up into arbitrary parts. +These parts are then passed to a member function with the name of the :doc:`command `. +For the :doc:`region ` command that means the :code:`region()` method can be called. +The arguments of the command can be passed as one string, or +individually. - from lammps import lammps +.. code-block:: python - lmp = lammps() - lmp.command("region box block 0 10 0 5 -0.5 0.5") + from lammps import lammps - With the PyLammps interface, any LAMMPS command can be split up into arbitrary parts. - These parts are then passed to a member function with the name of the :doc:`command `. - For the :doc:`region ` command that means the :code:`region()` method can be called. - The arguments of the command can be passed as one string, or - individually. + L = lammps() - .. code-block:: python + # pass command parameters as one string + L.cmd.region("box block 0 10 0 5 -0.5 0.5") - from lammps import PyLammps + # OR pass them individually + L.cmd.region("box block", 0, 10, 0, 5, -0.5, 0.5) - L = PyLammps() +In the latter example, all parameters except the first are Python floating-point literals. The +member function takes the entire parameter list and transparently merges it to a single command +string. - # pass command parameters as one string - L.region("box block 0 10 0 5 -0.5 0.5") +The benefit of this approach is avoiding redundant command calls and easier +parameterization. With `command`, `commands_list`, and `commands_string` the +parameterization needed to be done manually by creating formatted command +strings. - # OR pass them individually - L.region("box block", 0, 10, 0, 5, -0.5, 0.5) +.. code-block:: python - In the latter example, all parameters except the first are Python floating-point literals. The - member function takes the entire parameter list and transparently merges it to a single command - string. + lmp.command("region box block %f %f %f %f %f %f" % (xlo, xhi, ylo, yhi, zlo, zhi)) - The benefit of this approach is avoiding redundant command calls and easier - parameterization. In the lammps API parameterization needed to be done - manually by creating formatted command strings. +In contrast, methods of the `cmd` wrapper accept parameters directly and will convert +them automatically to a final command string. - .. code-block:: python +.. code-block:: python - lmp.command("region box block %f %f %f %f %f %f" % (xlo, xhi, ylo, yhi, zlo, zhi)) + L.cmd.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) - In contrast, methods of PyLammps accept parameters directly and will convert - them automatically to a final command string. +.. note:: - .. code-block:: python + When running in IPython you can use Tab-completion after ``L.cmd.`` to see + all available LAMMPS commands. - L.region("box block", xlo, xhi, ylo, yhi, zlo, zhi) +Using these facilities, the previous example shown above can be rewritten as follows: - Using these facilities, the example shown for the lammps API can be rewritten as follows: +.. code-block:: python - .. code-block:: python + from lammps import lammps + L = lammps() - from lammps import PyLammps - L = PyLammps() + # read commands from file 'in.melt' + L.file('in.melt') - # read commands from file 'in.melt' - L.file('in.melt') + # issue a single command + L.cmd.variable('zpos', 'index', 1.0) - # issue a single command - L.variable('zpos', 'index', 1.0) + # create 10 groups with 10 atoms each + for i in range(10): + L.cmd.group(f"g{i}", "id", f"{10*i+1}:{10*(i+1)}") - # create 10 groups with 10 atoms each - for i in range(10): - L.group(f"g{i}", "id", f"{10*i+1}:{10*(i+1)}") - - L.clear() - L.region("box block", 0, 2, 0, 2, 0, 2) - L.create_box(1, "box") - L.create_atoms(1, "single", 1.0, 1.0, "${zpos}") + L.cmd.clear() + L.cmd.region("box block", 0, 2, 0, 2, 0, 2) + L.cmd.create_box(1, "box") + L.cmd.create_atoms(1, "single", 1.0, 1.0, "${zpos}") diff --git a/doc/src/Python_head.rst b/doc/src/Python_head.rst index 3aab3a0d4b..28b6f3d1d4 100644 --- a/doc/src/Python_head.rst +++ b/doc/src/Python_head.rst @@ -15,6 +15,7 @@ together. Python_call Python_formats Python_examples + Python_jupyter Python_error Python_trouble diff --git a/doc/src/Python_jupyter.rst b/doc/src/Python_jupyter.rst new file mode 100644 index 0000000000..9e9a1c917e --- /dev/null +++ b/doc/src/Python_jupyter.rst @@ -0,0 +1,45 @@ +Using LAMMPS in IPython notebooks and Jupyter +============================================= + +If the LAMMPS Python package is installed for the same Python interpreter as +`IPython `_, you can use LAMMPS directly inside of an IPython notebook inside of +Jupyter. `Jupyter `_ is a powerful integrated development environment (IDE) for +many dynamic languages like Python, Julia and others, which operates inside of +any web browser. Besides auto-completion and syntax highlighting it allows you +to create formatted documents using Markup, mathematical formulas, graphics and +animations intermixed with executable Python code. It is a great format for +tutorials and showcasing your latest research. + +The easiest way to install it is via ``pip``: + +.. code-block:: bash + + pip install --user jupyter + +To launch an instance of Jupyter simply run the following command inside your +Python environment: + +.. code-block:: bash + + jupyter notebook + +Interactive Python Examples +--------------------------- + +Examples of IPython notebooks can be found in the ``python/examples/ipython`` +subdirectory. They require LAMMPS to be compiled as shared library with PYTHON, +PNG, JPEG and FFMPEG support. + +To open these notebooks launch ``jupyter notebook index.ipynb`` inside this +directory. The opened file provides an overview of the available examples. + +- Example 1: Using LAMMPS with Python (``simple.ipynb``) +- Example 2: Analyzing LAMMPS thermodynamic data (``thermo.ipynb``) +- Example 3: Working with Per-Atom Data (``atoms.ipynb``) +- Example 4: Working with LAMMPS variables (``variables.ipynb``) +- Example 5: Validating a dihedral potential (``dihedrals/dihedral.ipynb``) +- Example 6: Running a Monte Carlo relaxation (``montecarlo/mc.ipynb``) + +.. note:: + + Typically clicking a link in Jupyter will open a new tab, which might be blocked by your pop-up blocker. diff --git a/doc/src/Python_module.rst b/doc/src/Python_module.rst index c19d4b0345..30e585d143 100644 --- a/doc/src/Python_module.rst +++ b/doc/src/Python_module.rst @@ -10,19 +10,11 @@ be installed into a Python system folder or a user folder with ``make install-python``. Components of the module can then loaded into a Python session with the ``import`` command. -There are multiple Python interface classes in the :py:mod:`lammps` module: +.. warning:: -- the :py:class:`lammps ` class. This is a wrapper around - the C-library interface and its member functions try to replicate the - :ref:`C-library API ` closely. This is the most - feature-complete Python API. -- the :py:class:`PyLammps ` class. This is a more high-level - and more Python style class implemented on top of the - :py:class:`lammps ` class. -- the :py:class:`IPyLammps ` class is derived from - :py:class:`PyLammps ` and adds embedded graphics - features to conveniently include LAMMPS into `Jupyter - `_ notebooks. + Alternative interfaces such as :py:class:`PyLammps ` and + :py:class:`IPyLammps ` classes have been deprecated and + will be removed in a future version of LAMMPS. .. _mpi4py_url: https://mpi4py.readthedocs.io @@ -49,7 +41,7 @@ The ``lammps`` class API ======================== The :py:class:`lammps ` class is the core of the LAMMPS -Python interfaces. It is a wrapper around the :ref:`LAMMPS C library +Python interface. It is a wrapper around the :ref:`LAMMPS C library API ` using the `Python ctypes module `_ and a shared library compiled from the LAMMPS sources code. The individual methods in this @@ -64,40 +56,7 @@ functions. Below is a detailed documentation of the API. .. autoclass:: lammps.numpy_wrapper::numpy_wrapper :members: ----------- - -The ``PyLammps`` class API -========================== - -The :py:class:`PyLammps ` class is a wrapper that creates a -simpler, more "Pythonic" interface to common LAMMPS functionality. LAMMPS -data structures are exposed through objects and properties. This makes Python -scripts shorter and more concise. See the :doc:`PyLammps Tutorial -` for an introduction on how to use this interface. - -.. autoclass:: lammps.PyLammps - :members: - -.. autoclass:: lammps.AtomList - :members: - -.. autoclass:: lammps.Atom - :members: - -.. autoclass:: lammps.Atom2D - :members: - ----------- - -The ``IPyLammps`` class API -=========================== - -The :py:class:`IPyLammps ` class is an extension of -:py:class:`PyLammps `, adding additional functions to -quickly display visualizations such as images and videos inside of IPython. -See the :doc:`PyLammps Tutorial ` for examples. - -.. autoclass:: lammps.IPyLammps +.. autoclass:: lammps.ipython::wrapper :members: ---------- diff --git a/doc/src/Python_objects.rst b/doc/src/Python_objects.rst index 6e3a329a27..c3002ec5e6 100644 --- a/doc/src/Python_objects.rst +++ b/doc/src/Python_objects.rst @@ -4,95 +4,52 @@ Compute, fixes, variables This section documents accessing or modifying data from objects like computes, fixes, or variables in LAMMPS using the :py:mod:`lammps` module. -.. tabs:: +For :py:meth:`lammps.extract_compute() ` and +:py:meth:`lammps.extract_fix() `, the global, per-atom, +or local data calculated by the compute or fix can be accessed. What is returned +depends on whether the compute or fix calculates a scalar or vector or array. +For a scalar, a single double value is returned. If the compute or fix calculates +a vector or array, a pointer to the internal LAMMPS data is returned, which you can +use via normal Python subscripting. - .. tab:: lammps API +The one exception is that for a fix that calculates a +global vector or array, a single double value from the vector or array +is returned, indexed by I (vector) or I and J (array). I,J are +zero-based indices. +See the :doc:`Howto output ` page for a discussion of +global, per-atom, and local data, and of scalar, vector, and array +data types. See the doc pages for individual :doc:`computes ` +and :doc:`fixes ` for a description of what they calculate and +store. - For :py:meth:`lammps.extract_compute() ` and - :py:meth:`lammps.extract_fix() `, the global, per-atom, - or local data calculated by the compute or fix can be accessed. What is returned - depends on whether the compute or fix calculates a scalar or vector or array. - For a scalar, a single double value is returned. If the compute or fix calculates - a vector or array, a pointer to the internal LAMMPS data is returned, which you can - use via normal Python subscripting. +For :py:meth:`lammps.extract_variable() `, +an :doc:`equal-style or atom-style variable ` is evaluated and +its result returned. - The one exception is that for a fix that calculates a - global vector or array, a single double value from the vector or array - is returned, indexed by I (vector) or I and J (array). I,J are - zero-based indices. - See the :doc:`Howto output ` page for a discussion of - global, per-atom, and local data, and of scalar, vector, and array - data types. See the doc pages for individual :doc:`computes ` - and :doc:`fixes ` for a description of what they calculate and - store. +For equal-style variables a single ``c_double`` value is returned and the +group argument is ignored. For atom-style variables, a vector of +``c_double`` is returned, one value per atom, which you can use via normal +Python subscripting. The values will be zero for atoms not in the +specified group. - For :py:meth:`lammps.extract_variable() `, - an :doc:`equal-style or atom-style variable ` is evaluated and - its result returned. +:py:meth:`lammps.numpy.extract_compute() `, +:py:meth:`lammps.numpy.extract_fix() `, and +:py:meth:`lammps.numpy.extract_variable() ` are +equivalent NumPy implementations that return NumPy arrays instead of ``ctypes`` pointers. - For equal-style variables a single ``c_double`` value is returned and the - group argument is ignored. For atom-style variables, a vector of - ``c_double`` is returned, one value per atom, which you can use via normal - Python subscripting. The values will be zero for atoms not in the - specified group. +The :py:meth:`lammps.set_variable() ` method sets an +existing string-style variable to a new string value, so that subsequent LAMMPS +commands can access the variable. - :py:meth:`lammps.numpy.extract_compute() `, - :py:meth:`lammps.numpy.extract_fix() `, and - :py:meth:`lammps.numpy.extract_variable() ` are - equivalent NumPy implementations that return NumPy arrays instead of ``ctypes`` pointers. +**Methods**: - The :py:meth:`lammps.set_variable() ` method sets an - existing string-style variable to a new string value, so that subsequent LAMMPS - commands can access the variable. +* :py:meth:`lammps.extract_compute() `: extract value(s) from a compute +* :py:meth:`lammps.extract_fix() `: extract value(s) from a fix +* :py:meth:`lammps.extract_variable() `: extract value(s) from a variable +* :py:meth:`lammps.set_variable() `: set existing named string-style variable to value - **Methods**: +**NumPy Methods**: - * :py:meth:`lammps.extract_compute() `: extract value(s) from a compute - * :py:meth:`lammps.extract_fix() `: extract value(s) from a fix - * :py:meth:`lammps.extract_variable() `: extract value(s) from a variable - * :py:meth:`lammps.set_variable() `: set existing named string-style variable to value - - **NumPy Methods**: - - * :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays - * :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays - - - .. tab:: PyLammps/IPyLammps API - - PyLammps and IPyLammps classes currently do not add any additional ways of - retrieving information out of computes and fixes. This information can still be accessed by using the lammps API: - - .. code-block:: python - - L.lmp.extract_compute(...) - L.lmp.extract_fix(...) - # OR - L.lmp.numpy.extract_compute(...) - L.lmp.numpy.extract_fix(...) - - LAMMPS variables can be both defined and accessed via the :py:class:`PyLammps ` interface. - - To define a variable you can use the :doc:`variable ` command: - - .. code-block:: python - - L.variable("a index 2") - - A dictionary of all variables is returned by the :py:attr:`PyLammps.variables ` property: - - you can access an individual variable by retrieving a variable object from the - ``L.variables`` dictionary by name - - .. code-block:: python - - a = L.variables['a'] - - The variable value can then be easily read and written by accessing the value - property of this object. - - .. code-block:: python - - print(a.value) - a.value = 4 +* :py:meth:`lammps.numpy.extract_compute() `: extract value(s) from a compute, return arrays as numpy arrays +* :py:meth:`lammps.numpy.extract_fix() `: extract value(s) from a fix, return arrays as numpy arrays +* :py:meth:`lammps.numpy.extract_variable() `: extract value(s) from a variable, return arrays as numpy arrays diff --git a/doc/src/Python_overview.rst b/doc/src/Python_overview.rst index a13da0d512..85bc0d3bfa 100644 --- a/doc/src/Python_overview.rst +++ b/doc/src/Python_overview.rst @@ -56,7 +56,7 @@ Below is an example output for Python version 3.8.5. --------- -LAMMPS can work together with Python in three ways. First, Python can +LAMMPS can work together with Python in two ways. First, Python can wrap LAMMPS through the its :doc:`library interface `, so that a Python script can create one or more instances of LAMMPS and launch one or more simulations. In Python terms, this is referred to as @@ -67,22 +67,7 @@ launch one or more simulations. In Python terms, this is referred to as Launching LAMMPS via Python - -Second, the lower-level Python interface in the :py:class:`lammps Python -class ` can be used indirectly through the provided -:py:class:`PyLammps ` and :py:class:`IPyLammps -` wrapper classes, also written in Python. These -wrappers try to simplify the usage of LAMMPS in Python by providing a -more object-based interface to common LAMMPS functionality. They also -reduce the amount of code necessary to parameterize LAMMPS scripts -through Python and make variables and computes directly accessible. - -.. figure:: JPG/pylammps-invoke-lammps.png - :figclass: align-center - - Using the PyLammps / IPyLammps wrappers - -Third, LAMMPS can use the Python interpreter, so that a LAMMPS input +Second, LAMMPS can use the Python interpreter, so that a LAMMPS input script or styles can invoke Python code directly, and pass information back-and-forth between the input script and Python functions you write. This Python code can also call back to LAMMPS to query or change its diff --git a/doc/src/Python_properties.rst b/doc/src/Python_properties.rst index 031461660a..25040ad5bd 100644 --- a/doc/src/Python_properties.rst +++ b/doc/src/Python_properties.rst @@ -2,14 +2,8 @@ System properties ================= Similar to what is described in :doc:`Library_properties`, the instances of -:py:class:`lammps `, :py:class:`PyLammps `, or -:py:class:`IPyLammps ` can be used to extract different kinds -of information about the active LAMMPS instance and also to modify some of it. The -main difference between the interfaces is how the information is exposed. - -While the :py:class:`lammps ` is just a thin layer that wraps C API calls, -:py:class:`PyLammps ` and :py:class:`IPyLammps ` expose -information as objects and properties. +:py:class:`lammps ` can be used to extract different kinds +of information about the active LAMMPS instance and also to modify some of it. In some cases the data returned is a direct reference to the original data inside LAMMPS cast to ``ctypes`` pointers. Where possible, the wrappers will @@ -25,113 +19,38 @@ against invalid accesses. accordingly. These arrays can change sizes and order at every neighbor list rebuild and atom sort event as atoms are migrating between subdomains. -.. tabs:: +.. code-block:: python - .. tab:: lammps API + from lammps import lammps - .. code-block:: python + lmp = lammps() + lmp.file("in.sysinit") - from lammps import lammps + natoms = lmp.get_natoms() + print(f"running simulation with {natoms} atoms") - lmp = lammps() - lmp.file("in.sysinit") + lmp.command("run 1000 post no"); - natoms = lmp.get_natoms() - print(f"running simulation with {natoms} atoms") + for i in range(10): + lmp.command("run 100 pre no post no") + pe = lmp.get_thermo("pe") + ke = lmp.get_thermo("ke") + print(f"PE = {pe}\nKE = {ke}") - lmp.command("run 1000 post no"); + lmp.close() - for i in range(10): - lmp.command("run 100 pre no post no") - pe = lmp.get_thermo("pe") - ke = lmp.get_thermo("ke") - print(f"PE = {pe}\nKE = {ke}") +**Methods**: - lmp.close() +* :py:meth:`version() `: return the numerical version id, e.g. LAMMPS 2 Sep 2015 -> 20150902 +* :py:meth:`get_thermo() `: return current value of a thermo keyword +* :py:meth:`last_thermo() `: return a dictionary of the last thermodynamic output +* :py:meth:`get_natoms() `: total # of atoms as int +* :py:meth:`reset_box() `: reset the simulation box size +* :py:meth:`extract_setting() `: return a global setting +* :py:meth:`extract_global() `: extract a global quantity +* :py:meth:`extract_box() `: extract box info +* :py:meth:`create_atoms() `: create N atoms with IDs, types, x, v, and image flags - **Methods**: +**Properties**: - * :py:meth:`version() `: return the numerical version id, e.g. LAMMPS 2 Sep 2015 -> 20150902 - * :py:meth:`get_thermo() `: return current value of a thermo keyword - * :py:meth:`last_thermo() `: return a dictionary of the last thermodynamic output - * :py:meth:`get_natoms() `: total # of atoms as int - * :py:meth:`reset_box() `: reset the simulation box size - * :py:meth:`extract_setting() `: return a global setting - * :py:meth:`extract_global() `: extract a global quantity - * :py:meth:`extract_box() `: extract box info - * :py:meth:`create_atoms() `: create N atoms with IDs, types, x, v, and image flags - - **Properties**: - - * :py:attr:`last_thermo_step `: the last timestep thermodynamic output was computed - - .. tab:: PyLammps/IPyLammps API - - In addition to the functions provided by :py:class:`lammps `, :py:class:`PyLammps ` objects - have several properties which allow you to query the system state: - - L.system - Is a dictionary describing the system such as the bounding box or number of atoms - - L.system.xlo, L.system.xhi - bounding box limits along x-axis - - L.system.ylo, L.system.yhi - bounding box limits along y-axis - - L.system.zlo, L.system.zhi - bounding box limits along z-axis - - L.communication - configuration of communication subsystem, such as the number of threads or processors - - L.communication.nthreads - number of threads used by each LAMMPS process - - L.communication.nprocs - number of MPI processes used by LAMMPS - - L.fixes - List of fixes in the current system - - L.computes - List of active computes in the current system - - L.dump - List of active dumps in the current system - - L.groups - List of groups present in the current system - - **Retrieving the value of an arbitrary LAMMPS expressions** - - LAMMPS expressions can be immediately evaluated by using the ``eval`` method. The - passed string parameter can be any expression containing global :doc:`thermo` values, - variables, compute or fix data (see :doc:`Howto_output`): - - - .. code-block:: python - - result = L.eval("ke") # kinetic energy - result = L.eval("pe") # potential energy - - result = L.eval("v_t/2.0") - - **Example** - - .. code-block:: python - - from lammps import PyLammps - - L = PyLammps() - L.file("in.sysinit") - - print(f"running simulation with {L.system.natoms} atoms") - - L.run(1000, "post no"); - - for i in range(10): - L.run(100, "pre no post no") - pe = L.eval("pe") - ke = L.eval("ke") - print(f"PE = {pe}\nKE = {ke}") +* :py:attr:`last_thermo_step `: the last timestep thermodynamic output was computed diff --git a/doc/src/Run_basics.rst b/doc/src/Run_basics.rst index 22cec5858b..f18eaff75a 100644 --- a/doc/src/Run_basics.rst +++ b/doc/src/Run_basics.rst @@ -1,8 +1,8 @@ Basics of running LAMMPS ======================== -LAMMPS is run from the command line, reading commands from a file via -the ``-in`` command line flag, or from standard input. Using the ``-in +LAMMPS is run from the command-line, reading commands from a file via +the ``-in`` command-line flag, or from standard input. Using the ``-in in.file`` variant is recommended (see note below). The name of the LAMMPS executable is either ``lmp`` or ``lmp_`` with `` being the machine string used when compiling LAMMPS. This @@ -25,7 +25,7 @@ build LAMMPS: You normally run the LAMMPS command in the directory where your input script is located. That is also where output files are produced by default, unless you provide specific other paths in your input script or -on the command line. As in some of the examples above, the LAMMPS +on the command-line. As in some of the examples above, the LAMMPS executable itself can be placed elsewhere. .. note:: diff --git a/doc/src/Run_options.rst b/doc/src/Run_options.rst index 4f7021cd53..546658a644 100644 --- a/doc/src/Run_options.rst +++ b/doc/src/Run_options.rst @@ -632,7 +632,7 @@ the ``-package omp`` command-line switch or the :doc:`package omp ` com The :doc:`suffix ` command can also be used within an input script to set a suffix, or to turn off or back on any suffix setting -made via the command line. +made via the command-line. ---------- diff --git a/doc/src/Run_windows.rst b/doc/src/Run_windows.rst index 8c2eac2bc6..a03f27c013 100644 --- a/doc/src/Run_windows.rst +++ b/doc/src/Run_windows.rst @@ -20,7 +20,7 @@ To run with 4 threads, you can type this: lmp -in in.lj.lmp -k on t 4 -sf kk Alternately, you can also install a package with LAMMPS-GUI included and -open the LAMMPS-GUI app (the package includes the command line version +open the LAMMPS-GUI app (the package includes the command-line version of LAMMPS as well) and open the input file in the GUI and run it from there. For details on LAMMPS-GUI, see :doc:`Howto_lammps_gui`. diff --git a/doc/src/Speed_gpu.rst b/doc/src/Speed_gpu.rst index 42bd8bf059..c90e3fcc5e 100644 --- a/doc/src/Speed_gpu.rst +++ b/doc/src/Speed_gpu.rst @@ -31,7 +31,8 @@ Coulombics. It has the following general features: (for Nvidia GPUs, AMD GPUs, Intel GPUs, and multicore CPUs). so that the same functionality is supported on a variety of hardware. -**Required hardware/software:** +Required hardware/software +"""""""""""""""""""""""""" To compile and use this package in CUDA mode, you currently need to have an NVIDIA GPU and install the corresponding NVIDIA CUDA @@ -69,12 +70,14 @@ To compile and use this package in HIP mode, you have to have the AMD ROCm software installed. Versions of ROCm older than 3.5 are currently deprecated by AMD. -**Building LAMMPS with the GPU package:** +Building LAMMPS with the GPU package +"""""""""""""""""""""""""""""""""""" See the :ref:`Build extras ` page for instructions. -**Run with the GPU package from the command line:** +Run with the GPU package from the command-line +"""""""""""""""""""""""""""""""""""""""""""""" The ``mpirun`` or ``mpiexec`` command sets the total number of MPI tasks used by LAMMPS (one or multiple per compute node) and the number of MPI @@ -133,7 +136,8 @@ affect the setting for bonded interactions (LAMMPS default is "on"). The "off" setting for pairwise interaction is currently required for GPU package pair styles. -**Or run with the GPU package by editing an input script:** +Run with the GPU package by editing an input script +""""""""""""""""""""""""""""""""""""""""""""""""""" The discussion above for the ``mpirun`` or ``mpiexec`` command, MPI tasks/node, and use of multiple MPI tasks/GPU is the same. @@ -149,7 +153,8 @@ You must also use the :doc:`package gpu ` command to enable the GPU package, unless the ``-sf gpu`` or ``-pk gpu`` :doc:`command-line switches ` were used. It specifies the number of GPUs/node to use, as well as other options. -**Speed-ups to expect:** +Speed-up to expect +"""""""""""""""""" The performance of a GPU versus a multicore CPU is a function of your hardware, which pair style is used, the number of atoms/GPU, and the @@ -176,10 +181,13 @@ better with multiple OMP threads because the inter-process communication is higher for these styles with the GPU package in order to allow deterministic results. -**Guidelines for best performance:** +Guidelines for best performance +""""""""""""""""""""""""""""""" -* Using multiple MPI tasks per GPU will often give the best performance, - as allowed my most multicore CPU/GPU configurations. +* Using multiple MPI tasks (2-10) per GPU will often give the best + performance, as allowed my most multicore CPU/GPU configurations. + Using too many MPI tasks will result in worse performance due to + growing overhead with the growing number of MPI tasks. * If the number of particles per MPI task is small (e.g. 100s of particles), it can be more efficient to run with fewer MPI tasks per GPU, even if you do not use all the cores on the compute node. @@ -199,12 +207,13 @@ deterministic results. :doc:`angle `, :doc:`dihedral `, :doc:`improper `, and :doc:`long-range ` calculations will not be included in the "Pair" time. -* Since only part of the pppm kspace style is GPU accelerated, it - may be faster to only use GPU acceleration for Pair styles with - long-range electrostatics. See the "pair/only" keyword of the - package command for a shortcut to do that. The work between kspace - on the CPU and non-bonded interactions on the GPU can be balanced - through adjusting the coulomb cutoff without loss of accuracy. +* Since only part of the pppm kspace style is GPU accelerated, it may be + faster to only use GPU acceleration for Pair styles with long-range + electrostatics. See the "pair/only" keyword of the :doc:`package + command ` for a shortcut to do that. The distribution of + work between kspace on the CPU and non-bonded interactions on the GPU + can be balanced through adjusting the coulomb cutoff without loss of + accuracy. * When the *mode* setting for the package gpu command is force/neigh, the time for neighbor list calculations on the GPU will be added into the "Pair" time, not the "Neigh" time. An additional breakdown of the @@ -220,4 +229,6 @@ deterministic results. Restrictions """""""""""" -None. +When using :doc:`hybrid pair styles `, the neighbor list +must be generated on the host instead of the GPU and thus the potential +GPU acceleration is reduced. diff --git a/doc/src/Speed_intel.rst b/doc/src/Speed_intel.rst index dd6c27b4e7..78a88f4407 100644 --- a/doc/src/Speed_intel.rst +++ b/doc/src/Speed_intel.rst @@ -1,5 +1,5 @@ INTEL package -================== +============= The INTEL package is maintained by Mike Brown at Intel Corporation. It provides two methods for accelerating simulations, @@ -13,18 +13,18 @@ twice, once on the CPU and once with an offload flag. This allows LAMMPS to run on the CPU cores and co-processor cores simultaneously. Currently Available INTEL Styles -""""""""""""""""""""""""""""""""""""" +"""""""""""""""""""""""""""""""" * Angle Styles: charmm, harmonic -* Bond Styles: fene, fourier, harmonic +* Bond Styles: fene, harmonic * Dihedral Styles: charmm, fourier, harmonic, opls -* Fixes: nve, npt, nvt, nvt/sllod, nve/asphere +* Fixes: nve, npt, nvt, nvt/sllod, nve/asphere, electrode/conp, electrode/conq, electrode/thermo * Improper Styles: cvff, harmonic * Pair Styles: airebo, airebo/morse, buck/coul/cut, buck/coul/long, buck, dpd, eam, eam/alloy, eam/fs, gayberne, lj/charmm/coul/charmm, lj/charmm/coul/long, lj/cut, lj/cut/coul/long, lj/long/coul/long, - rebo, sw, tersoff -* K-Space Styles: pppm, pppm/disp + rebo, snap, sw, tersoff +* K-Space Styles: pppm, pppm/disp, pppm/electrode .. warning:: @@ -33,7 +33,7 @@ Currently Available INTEL Styles input requires it, LAMMPS will abort with an error message. Speed-up to expect -""""""""""""""""""" +"""""""""""""""""" The speedup will depend on your simulation, the hardware, which styles are used, the number of atoms, and the floating-point @@ -312,21 +312,21 @@ almost all cases. recommended, especially when running on a machine with Intel Hyper-Threading technology disabled. -Run with the INTEL package from the command line -""""""""""""""""""""""""""""""""""""""""""""""""""""" +Run with the INTEL package from the command-line +"""""""""""""""""""""""""""""""""""""""""""""""" -To enable INTEL optimizations for all available styles used in -the input script, the ``-sf intel`` :doc:`command-line switch ` can be used without any requirement for -editing the input script. This switch will automatically append -"intel" to styles that support it. It also invokes a default command: -:doc:`package intel 1 `. This package command is used to set -options for the INTEL package. The default package command will -specify that INTEL calculations are performed in mixed precision, -that the number of OpenMP threads is specified by the OMP_NUM_THREADS -environment variable, and that if co-processors are present and the -binary was built with offload support, that 1 co-processor per node -will be used with automatic balancing of work between the CPU and the -co-processor. +To enable INTEL optimizations for all available styles used in the input +script, the ``-sf intel`` :doc:`command-line switch ` can +be used without any requirement for editing the input script. This +switch will automatically append "intel" to styles that support it. It +also invokes a default command: :doc:`package intel 1 `. This +package command is used to set options for the INTEL package. The +default package command will specify that INTEL calculations are +performed in mixed precision, that the number of OpenMP threads is +specified by the OMP_NUM_THREADS environment variable, and that if +co-processors are present and the binary was built with offload support, +that 1 co-processor per node will be used with automatic balancing of +work between the CPU and the co-processor. You can specify different options for the INTEL package by using the ``-pk intel Nphi`` :doc:`command-line switch ` with diff --git a/doc/src/Speed_kokkos.rst b/doc/src/Speed_kokkos.rst index 40e748c9fe..9f8dcf8340 100644 --- a/doc/src/Speed_kokkos.rst +++ b/doc/src/Speed_kokkos.rst @@ -77,7 +77,7 @@ version 23 November 2023 and Kokkos version 4.2. rank. When running with multiple MPI ranks, you may see segmentation faults without GPU-aware MPI support. These can be avoided by adding the flags :doc:`-pk kokkos gpu/aware off ` to the - LAMMPS command line or by using the command :doc:`package kokkos + LAMMPS command-line or by using the command :doc:`package kokkos gpu/aware off ` in the input file. .. admonition:: Intel Data Center GPU support @@ -423,7 +423,7 @@ in the ``kokkos-cuda.cmake`` CMake preset file. cmake -DKokkos_ENABLE_CUDA=yes -DKokkos_ENABLE_OPENMP=yes ../cmake The suffix "/kk" is equivalent to "/kk/device", and for Kokkos CUDA, -using the ``-sf kk`` in the command line gives the default CUDA version +using the ``-sf kk`` in the command-line gives the default CUDA version everywhere. However, if the "/kk/host" suffix is added to a specific style in the input script, the Kokkos OpenMP (CPU) version of that specific style will be used instead. Set the number of OpenMP threads @@ -439,7 +439,7 @@ For example, the command to run with 1 GPU and 8 OpenMP threads is then: mpiexec -np 1 lmp_kokkos_cuda_openmpi -in in.lj -k on g 1 t 8 -sf kk -Conversely, if the ``-sf kk/host`` is used in the command line and then +Conversely, if the ``-sf kk/host`` is used in the command-line and then the "/kk" or "/kk/device" suffix is added to a specific style in your input script, then only that specific style will run on the GPU while everything else will run on the CPU in OpenMP mode. Note that the @@ -451,7 +451,7 @@ on the host CPU can overlap with a pair style running on the GPU. First compile with ``--default-stream per-thread`` added to ``CCFLAGS`` in the Kokkos CUDA Makefile. Then explicitly use the "/kk/host" suffix for kspace and bonds, angles, etc. in the input file and the -"kk" suffix (equal to "kk/device") on the command line. Also make +"kk" suffix (equal to "kk/device") on the command-line. Also make sure the environment variable ``CUDA_LAUNCH_BLOCKING`` is not set to "1" so CPU/GPU overlap can occur. diff --git a/doc/src/Speed_omp.rst b/doc/src/Speed_omp.rst index bf0744f0f4..906acf3ce8 100644 --- a/doc/src/Speed_omp.rst +++ b/doc/src/Speed_omp.rst @@ -21,7 +21,7 @@ Building LAMMPS with the OPENMP package See the :ref:`Build extras ` page for instructions. -Run with the OPENMP package from the command line +Run with the OPENMP package from the command-line """"""""""""""""""""""""""""""""""""""""""""""""""" These examples assume one or more 16-core nodes. diff --git a/doc/src/Speed_opt.rst b/doc/src/Speed_opt.rst index e115393017..15e18dad8e 100644 --- a/doc/src/Speed_opt.rst +++ b/doc/src/Speed_opt.rst @@ -17,7 +17,7 @@ Building LAMMPS with the OPT package See the :ref:`Build extras ` page for instructions. -Run with the OPT package from the command line +Run with the OPT package from the command-line """""""""""""""""""""""""""""""""""""""""""""" .. code-block:: bash diff --git a/doc/src/Tools.rst b/doc/src/Tools.rst index ba7cb2035a..d13a6d384f 100644 --- a/doc/src/Tools.rst +++ b/doc/src/Tools.rst @@ -501,7 +501,7 @@ Here are a few highlights of LAMMPS-GUI - Indicator for line that caused an error - Visualization of current state in Image Viewer (via calling :doc:`write_dump image `) - Capture of images created via :doc:`dump image ` in Slide show window -- Dialog to set variables, similar to the LAMMPS command line flag '-v' / '-var' +- Dialog to set variables, similar to the LAMMPS command-line flag '-v' / '-var' - Support for GPU, INTEL, KOKKOS/OpenMP, OPENMAP, and OPT and accelerator packages Parallelization @@ -550,7 +550,7 @@ will be found automatically. 2) you can download the `Flatpak file *flatpak* command: ``flatpak install --user LAMMPS-Linux-x86_64-GUI-.flatpak`` and run it with ``flatpak run org.lammps.lammps-gui``. The flatpak bundle also includes the -command line version of LAMMPS and some LAMMPS tools like msi2lmp. The +command-line version of LAMMPS and some LAMMPS tools like msi2lmp. The can be launched by using the ``--command`` flag. For example to run LAMMPS directly on the ``in.lj`` benchmark input you would type in the ``bench`` folder: ``flatpak run --command=lmp -in in.lj`` The flatpak @@ -608,10 +608,10 @@ would be the ``examples/COUPLE/plugin`` folder of the LAMMPS distribution. When compiling LAMMPS-GUI with plugin support, there is an additional -command line flag (``-p `` or ``--pluginpath ``) which +command-line flag (``-p `` or ``--pluginpath ``) which allows to override the path to LAMMPS shared library used by the GUI. This is usually auto-detected on the first run and can be changed in the -LAMMPS-GUI *Preferences* dialog. The command line flag allows to reset +LAMMPS-GUI *Preferences* dialog. The command-line flag allows to reset this path to a valid value in case the original setting has become invalid. An empty path ("") as argument restores the default setting. @@ -656,7 +656,7 @@ it will create a compressed ``LAMMPS-Win10-amd64.zip`` zip file with the executables and required dependent .dll files. This zip file can be uncompressed and ``lammps-gui.exe`` run directly from there. The uncompressed folder can be added to the ``PATH`` environment and LAMMPS -and LAMMPS-GUI can be launched from anywhere from the command line. +and LAMMPS-GUI can be launched from anywhere from the command-line. **MinGW64 Cross-compiler** @@ -876,7 +876,7 @@ the same ``LAMMPS_CACHING_DIR``. This script does the following: #. Start a simple local HTTP server using Python to host files for CMake Afterwards, it will print out instruction on how to modify the CMake -command line to make sure it uses the local HTTP server. +commands to make sure it uses the local HTTP server. To undo the environment changes and shutdown the local HTTP server, run the ``deactivate_caches`` command. @@ -1025,7 +1025,7 @@ with those in the provided log file with the same number of processors in the same subdirectory. If the differences between the actual and reference values are within specified tolerances, the test is considered passed. For each test batch, that is, a set of example input scripts, -the mpirun command, the LAMMPS command line arguments, and the +the mpirun command, the LAMMPS command-line arguments, and the tolerances for individual thermo quantities can be specified in a configuration file in YAML format. diff --git a/doc/src/angle_mwlc.rst b/doc/src/angle_mwlc.rst new file mode 100644 index 0000000000..9fa4171a63 --- /dev/null +++ b/doc/src/angle_mwlc.rst @@ -0,0 +1,94 @@ +.. index:: angle_style mwlc + +angle_style mwlc command +========================== + +Syntax +"""""" + +.. code-block:: LAMMPS + + angle_style mwlc + +Examples +"""""""" + +.. code-block:: LAMMPS + + angle_style mwlc + angle_coeff * 25 1 10 1 + +Description +""""""""""" + +.. versionadded:: TBD + +The *mwlc* angle style models a meltable wormlike chain and can be used +to model non-linear bending elasticity of polymers, e.g. DNA. *mwlc* +uses a potential that is a canonical-ensemble superposition of a +non-melted and a melted state :ref:`(Farrell) `. The potential +is + +.. math:: + + E = -k_{B}T\,\log [q + q^{m}] + E_{0}, + +where the non-melted and melted partition functions are + +.. math:: + q = \exp [-k_{1}(1+\cos{\theta})/k_{B}T]; \\ + q^{m} = \exp [-(\mu+k_{2}(1+\cos{\theta}))/k_{B}T]. + +:math:`k_1` is the bending elastic constant of the non-melted state, +:math:`k_2` is the bending elastic constant of the melted state, +:math:`\mu` is the melting energy, and +:math:`T` is the reference temperature. +The reference energy, + +.. math:: + E_{0} = -k_{B}T\,\log [1 + \exp[-\mu/k_{B}T]], + +ensures that E is zero for a fully extended chain. + +This potential is a continuous version of the two-state potential +introduced by :ref:`(Yan) `. + +The following coefficients must be defined for each angle type via the +:doc:`angle_coeff ` command as in the example above, or in +the data file or restart files read by the :doc:`read_data ` +or :doc:`read_restart ` commands: + +* :math:`k_1` (energy) +* :math:`k_2` (energy) +* :math:`\mu` (energy) +* :math:`T` (temperature) + +---------- + + +Restrictions +"""""""""""" + +This angle style can only be used if LAMMPS was built with the +EXTRA-MOLECULE package. See the :doc:`Build package ` +doc page for more info. + +Related commands +"""""""""""""""" + +:doc:`angle_coeff ` + +Default +""""""" + +none + +---------- + +.. _Farrell: + +**(Farrell)** `Farrell, Dobnikar, Podgornik, Curk, Phys Rev Lett, 133, 148101 (2024). `_ + +.. _Yan: + +**(Yan)** `Yan, Marko, Phys Rev Lett, 93, 108108 (2004). `_ diff --git a/doc/src/angle_style.rst b/doc/src/angle_style.rst index dc16a3fbaa..ab7671c75d 100644 --- a/doc/src/angle_style.rst +++ b/doc/src/angle_style.rst @@ -94,6 +94,7 @@ of (g,i,k,o,t) to indicate which accelerated styles exist. * :doc:`lepton ` - angle potential from evaluating a string * :doc:`mesocnt ` - piecewise harmonic and linear angle for bending-buckling of nanotubes * :doc:`mm3 ` - anharmonic angle +* :doc:`mwlc ` - meltable wormlike chain * :doc:`quartic ` - angle with cubic and quartic terms * :doc:`spica ` - harmonic angle with repulsive SPICA pair style between 1-3 atoms * :doc:`table ` - tabulated by angle diff --git a/doc/src/compute_temp_chunk.rst b/doc/src/compute_temp_chunk.rst index 33eab04343..3d97ad1699 100644 --- a/doc/src/compute_temp_chunk.rst +++ b/doc/src/compute_temp_chunk.rst @@ -184,11 +184,24 @@ temp/chunk calculation to a file is to use the The keyword/value option pairs are used in the following ways. The *com* keyword can be used with a value of *yes* to subtract the -velocity of the center-of-mass for each chunk from the velocity of the -atoms in that chunk, before calculating either the global or per-chunk -temperature. This can be useful if the atoms are streaming or +velocity of the center-of-mass (VCM) for each chunk from the velocity of +the atoms in that chunk, before calculating either the global or per-chunk +temperature. This can be useful if the atoms are streaming or otherwise moving collectively, and you wish to calculate only the -thermal temperature. +thermal temperature. This per-chunk VCM bias can be used in other fixes and +computes that can incorporate a temperature bias. If this compute is used +as a temperature bias in other commands then this bias is subtracted from +each atom, the command runs with the remaining thermal velocities, and +then the bias is added back in. This includes thermostatting +fixes like :doc:`fix nvt `, +:doc:`fix temp/rescale `, +:doc:`fix temp/berendsen `, and +:doc:`fix langevin `, and computes like +:doc:`compute stress/atom ` and +:doc:`compute pressure `. See the input script in +examples/stress_vcm for an example of how to use the *com* keyword in +conjunction with compute stress/atom to create a stress profile of a rigid +body while removing the overall motion of the rigid body. For the *bias* keyword, *bias-ID* refers to the ID of a temperature compute that removes a "bias" velocity from each atom. This also diff --git a/doc/src/dump_image.rst b/doc/src/dump_image.rst index 2336a5889a..8313d3cccb 100644 --- a/doc/src/dump_image.rst +++ b/doc/src/dump_image.rst @@ -681,7 +681,7 @@ MPEG or other movie file you can use: * c) Use FFmpeg - FFmpeg is a command line tool that is available on many platforms and + FFmpeg is a command-line tool that is available on many platforms and allows extremely flexible encoding and decoding of movies. .. code-block:: bash diff --git a/doc/src/fix_adapt.rst b/doc/src/fix_adapt.rst index 1ddf80cbdb..93bd8da041 100644 --- a/doc/src/fix_adapt.rst +++ b/doc/src/fix_adapt.rst @@ -406,6 +406,8 @@ sub-style name. The angle styles that currently work with fix adapt are: +--------------------------------------------------------------------+--------------------+-------------+ | :doc:`mm3 ` | k,theta0 | type angles | +--------------------------------------------------------------------+--------------------+-------------+ +| :doc:`mwlc ` | k1,k2,mu,T | type angles | ++--------------------------------------------------------------------+--------------------+-------------+ | :doc:`quartic ` | k2,k3,k4,theta0 | type angles | +--------------------------------------------------------------------+--------------------+-------------+ | :doc:`spica ` | k,theta0 | type angles | diff --git a/doc/src/fix_imd.rst b/doc/src/fix_imd.rst index 520af505a1..a4d7d9d387 100644 --- a/doc/src/fix_imd.rst +++ b/doc/src/fix_imd.rst @@ -26,6 +26,29 @@ Syntax *nowait* arg = *on* or *off* off = LAMMPS waits to be connected to an IMD client before continuing (default) on = LAMMPS listens for an IMD client, but continues with the run + *version* arg = *2* or *3* + 2 = use IMD protocol version 2 (default) + 3 = use IMD protocol version 3. + + The following keywords are only supported for IMD protocol version 3. + + .. parsed-literal:: + + *time* arg = *on* or *off* + off = simulation time is not transmitted (default) + on = simulation time is transmitted. + *box* arg = *on* or *off* + off = simulation box data is not transmitted (default) + on = simulation box data is transmitted. + *coordinates* arg = *on* or *off* + off = atomic coordinates are not transmitted (default) + on = atomic coordinates are transmitted. + *velocities* arg = *on* or *off* + off = atomic velocities are not transmitted (default) + on = atomic velocities are transmitted. + *forces* arg = *on* or *off* + off = atomic forces are not transmitted (default) + on = atomic forces are transmitted. Examples """""""" @@ -40,16 +63,19 @@ Description This fix implements the "Interactive MD" (IMD) protocol which allows realtime visualization and manipulation of MD simulations through the -IMD protocol, as initially implemented in VMD and NAMD. Specifically -it allows LAMMPS to connect an IMD client, for example the `VMD visualization program `_, so that it can monitor the progress of the -simulation and interactively apply forces to selected atoms. +IMD protocol, as initially implemented in VMD and NAMD. Specifically it +allows LAMMPS to connect an IMD client, for example the `VMD +visualization program `_ (currently only supports IMDv2) or the +`Python IMDClient `_ (supports both IMDv2 and IMDv3), so +that it can monitor the progress of the simulation and interactively +apply forces to selected atoms. -If LAMMPS is compiled with the pre-processor flag -DLAMMPS_ASYNC_IMD -then fix imd will use POSIX threads to spawn a IMD communication -thread on MPI rank 0 in order to offload data reading and writing -from the main execution thread and potentially lower the inferred -latencies for slow communication links. This feature has only been -tested under linux. +If LAMMPS is compiled with the pre-processor flag +:ref:`-DLAMMPS_ASYNC_IMD ` then fix imd will use POSIX threads to +spawn an IMD communication thread on MPI rank 0 in order to offload data +exchange with the IMD client from the main execution thread and +potentially lower the inferred latencies for slow communication +links. This feature has only been tested under linux. The source code for this fix includes code developed by the Theoretical and Computational Biophysics Group in the Beckman Institute for Advanced @@ -94,10 +120,19 @@ with different units or as a measure to tweak the forces generated by the manipulation of the IMD client, this option allows to make adjustments. +.. versionadded:: TBD + +In `IMDv3 `_, the IMD protocol has been extended to allow for +the transmission of simulation time, box dimensions, atomic coordinates, +velocities, and forces. The *version* keyword allows to select the +version of the protocol to be used. The *time*, *box*, *coordinates*, +*velocities*, and *forces* keywords allow to select which data is +transmitted to the IMD client. The default is to transmit all data. + To connect VMD to a listening LAMMPS simulation on the same machine with fix imd enabled, one needs to start VMD and load a coordinate or topology file that matches the fix group. When the VMD command -prompts appears, one types the command line: +prompts appears, one types the command: .. parsed-literal:: @@ -129,6 +164,10 @@ screen output is active. .. _imdvmd: https://www.ks.uiuc.edu/Research/vmd/imd/ +.. _IMDClient: https://github.com/Becksteinlab/imdclient/tree/main/imdclient + +.. _IMDv3: https://imdclient.readthedocs.io/en/latest/protocol_v3.html + Restart, fix_modify, output, run start/stop, minimize info """"""""""""""""""""""""""""""""""""""""""""""""""""""""""" @@ -147,14 +186,14 @@ This fix is part of the MISC package. It is only enabled if LAMMPS was built with that package. See the :doc:`Build package ` page for more info. -When used in combination with VMD, a topology or coordinate file has -to be loaded, which matches (in number and ordering of atoms) the -group the fix is applied to. The fix internally sorts atom IDs by -ascending integer value; in VMD (and thus the IMD protocol) those will -be assigned 0-based consecutive index numbers. +When used in combination with VMD, a topology or coordinate file has to +be loaded, which matches (in number and ordering of atoms) the group the +fix is applied to. The fix internally sorts atom IDs by ascending +integer value; in VMD (and thus the IMD protocol) those will be assigned +0-based consecutive index numbers. When using multiple active IMD connections at the same time, each -needs to use a different port number. +fix instance needs to use a different port number. Related commands """""""""""""""" diff --git a/doc/src/fix_precession_spin.rst b/doc/src/fix_precession_spin.rst index 7440989d7a..88325ba491 100644 --- a/doc/src/fix_precession_spin.rst +++ b/doc/src/fix_precession_spin.rst @@ -135,7 +135,7 @@ directions for the forces. Only the direction of the vector is important; its length is ignored (the entered vectors are normalized). -Those styles can be combined within one single command line. +Those styles can be combined within one single command. .. note:: diff --git a/doc/src/kim_commands.rst b/doc/src/kim_commands.rst index ab9b4486a2..20728dee06 100644 --- a/doc/src/kim_commands.rst +++ b/doc/src/kim_commands.rst @@ -1084,10 +1084,11 @@ the form of *key_name_key*-*key_name_value* pairs). For example, kim property modify 1 key mass source-value 26.98154 kim property modify 1 key mass source-unit amu -where the special keyword "key" is followed by a *key_name* ("species" or -"mass" in the above) and one or more key-value pairs. These key-value pairs -may continue until either another "key" keyword is given or the end of the -command line is reached. Thus, the above could equivalently be written as +where the special keyword "key" is followed by a *key_name* ("species" +or "mass" in the above) and one or more key-value pairs. These +key-value pairs may continue until either another "key" keyword is given +or the end of the line is reached. Thus, the above could equivalently +be written as .. code-block:: LAMMPS diff --git a/doc/src/label.rst b/doc/src/label.rst index bf232431d7..cae1386e90 100644 --- a/doc/src/label.rst +++ b/doc/src/label.rst @@ -24,12 +24,12 @@ Description """"""""""" Label this line of the input script with the chosen ID. Unless a jump -command was used previously, this does nothing. But if a -:doc:`jump ` command was used with a label argument to begin -invoking this script file, then all command lines in the script prior -to this line will be ignored. I.e. execution of the script will begin -at this line. This is useful for looping over a section of the input -script as discussed in the :doc:`jump ` command. +command was used previously, this does nothing. But if a :doc:`jump +` command was used with a label argument to begin invoking this +script file, then all commands in the script prior to this line will be +ignored. I.e. execution of the script will begin at this line. This is +useful for looping over a section of the input script as discussed in +the :doc:`jump ` command. Restrictions """""""""""" diff --git a/doc/src/package.rst b/doc/src/package.rst index 8be6639e72..ddb3656027 100644 --- a/doc/src/package.rst +++ b/doc/src/package.rst @@ -504,7 +504,7 @@ as it is for non-accelerated pair styles The *binsize* keyword sets the size of bins used to bin atoms during neighbor list builds. The same value can be set by the :doc:`neigh_modify binsize ` command. Making it an option -in the package kokkos command allows it to be set from the command line. +in the package kokkos command allows it to be set from the command-line. The default value for CPUs is 0.0, which means the LAMMPS default will be used, which is bins = 1/2 the size of the pairwise cutoff + neighbor skin distance. This is fine when neighbor lists are built on the CPU. For GPU @@ -664,7 +664,7 @@ too. Also note that if the :doc:`-sf hybrid intel omp command-line switch ` is used, it invokes a "package intel" command, followed by a "package omp" command, both with a setting of *Nthreads* = 0. Likewise for a hybrid suffix for gpu and omp. Note that KOKKOS also supports - setting the number of OpenMP threads from the command line using the + setting the number of OpenMP threads from the command-line using the "-k on" :doc:`command-line switch `. The default for KOKKOS is 1 thread per MPI task, so any other number of threads should be explicitly set using the "-k on" command-line switch (and this diff --git a/doc/src/pair_mgpt.rst b/doc/src/pair_mgpt.rst index 6ef9f52fac..92bf9cd738 100644 --- a/doc/src/pair_mgpt.rst +++ b/doc/src/pair_mgpt.rst @@ -111,8 +111,8 @@ routines. For x-86 machines, there is a provided Makefile.mgptfast which enables the fast algebra routines, i.e. build LAMMPS with "make mgptfast". The user will be informed in the output files of the matrix kernels in use. To further improve speed, on x86 the option -precision single can be added to the :doc:`pair_coeff ` -command line, which improves speed (up to a factor of two) at the cost +*precision single* can be added to the :doc:`pair_coeff ` +command, which improves speed (up to a factor of two) at the cost of doing matrix calculations with 7 digit precision instead of the default 16. For consistency the default option can be specified explicitly by the option precision double. diff --git a/doc/src/pair_thole.rst b/doc/src/pair_thole.rst index a4e8bbb96e..5b0d5653ff 100644 --- a/doc/src/pair_thole.rst +++ b/doc/src/pair_thole.rst @@ -131,7 +131,7 @@ command. * LJ cutoff (distance units) The last two coefficients are optional and default to the global values from -the *pair_style* command line. +the *pair_style* command. ---------- diff --git a/doc/src/read_restart.rst b/doc/src/read_restart.rst index 6a875550a4..eb7f577e4d 100644 --- a/doc/src/read_restart.rst +++ b/doc/src/read_restart.rst @@ -48,9 +48,9 @@ meaning that the trajectories of a restarted run will precisely match those produced by the original run had it continued on. Some information about a restart file can be gathered directly from the -command line when using LAMMPS with the :ref:`-restart2info -` command line flag. On Unix-like operating systems (like -Linux or macOS), one can also :ref:`configure the "file" command line +command-line when using LAMMPS with the :ref:`-restart2info +` command-line flag. On Unix-like operating systems (like +Linux or macOS), one can also :ref:`configure the "file" command-line program ` to display basic information about a restart file The binary restart file format was not designed with backward, forward, @@ -60,9 +60,9 @@ Changes to the architecture, compilation settings, or LAMMPS version can render a restart file unreadable or it may read the data incorrectly. If you want a more portable format, you can use the data file format as created by the :doc:`write_data ` command. Binary restart -files can also be converted into a data file from the command line by +files can also be converted into a data file from the command-line by the LAMMPS executable that wrote them using the :ref:`-restart2data -` command line flag. +` command-line flag. Several things can prevent exact restarts due to round-off effects, in which case the trajectories in the 2 runs will slowly diverge. These diff --git a/doc/src/suffix.rst b/doc/src/suffix.rst index 6c723657a5..98bb43a02d 100644 --- a/doc/src/suffix.rst +++ b/doc/src/suffix.rst @@ -30,7 +30,7 @@ Description This command allows you to use variants of various styles if they exist. In that respect it operates the same as the :doc:`-suffix command-line switch `. It also has options to turn -off or back on any suffix setting made via the command line. +off or back on any suffix setting made via the command-line. The specified style can be *gpu*, *intel*, *kk*, *omp*, *opt* or *hybrid*\ . These refer to optional packages that LAMMPS can be built diff --git a/doc/src/variable.rst b/doc/src/variable.rst index 1867532efa..3ec3ce4a65 100644 --- a/doc/src/variable.rst +++ b/doc/src/variable.rst @@ -71,9 +71,9 @@ Syntax feature functions = is_available(category,feature), is_active(category,feature), is_defined(category,id) atom value = id[i], mass[i], type[i], mol[i], x[i], y[i], z[i], vx[i], vy[i], vz[i], fx[i], fy[i], fz[i], q[i] atom vector = id, mass, type, mol, radius, q, x, y, z, vx, vy, vz, fx, fy, fz - custom atom property = i_name, d_name, i_name[i], d_name[i], i2_name[i], d2_name[i], i2_name[i][j], d_name[i][j] - compute references = c_ID, c_ID[i], c_ID[i][j], C_ID, C_ID[i] - fix references = f_ID, f_ID[i], f_ID[i][j], F_ID, F_ID[i] + custom atom property = i_name, d_name, i_name[i], d_name[i], i2_name[i], d2_name[i], i2_name[i][j], d2_name[i][j] + compute references = c_ID, c_ID[i], c_ID[i][j], C_ID, C_ID[i], C_ID[i][j] + fix references = f_ID, f_ID[i], f_ID[i][j], F_ID, F_ID[i], F_ID[i][j] variable references = v_name, v_name[i] vector initialization = [1,3,7,10] (for *vector* variables only) diff --git a/doc/utils/requirements.txt b/doc/utils/requirements.txt index acf575fe58..d842f47c11 100644 --- a/doc/utils/requirements.txt +++ b/doc/utils/requirements.txt @@ -9,3 +9,4 @@ Pygments six pyyaml linkchecker +ipython diff --git a/doc/utils/sphinx-config/false_positives.txt b/doc/utils/sphinx-config/false_positives.txt index b0831040d1..f272af6dec 100644 --- a/doc/utils/sphinx-config/false_positives.txt +++ b/doc/utils/sphinx-config/false_positives.txt @@ -2126,6 +2126,7 @@ Marchi Mariella Marinica Markland +Marko Marrink Marroquin Marsaglia @@ -2208,6 +2209,7 @@ Meissner Melchor Meloni Melrose +meltable mem Mem memalign @@ -2418,6 +2420,7 @@ mV Mvapich mvh mvv +mwlc MxN myCompute myIndex @@ -2942,6 +2945,7 @@ Pmoltrans pN png podd +Podgornik Podhorszki Poiseuille poisson @@ -4128,6 +4132,7 @@ workflow workflows Workum Worley +wormlike wpbe Wriggers writedata @@ -4197,6 +4202,7 @@ yaff YAFF Yamada yaml +Yan Yanxon Yaser Yazdani diff --git a/examples/COUPLE/plugin/liblammpsplugin.c b/examples/COUPLE/plugin/liblammpsplugin.c index 5b1308a5cc..99e38df32b 100644 --- a/examples/COUPLE/plugin/liblammpsplugin.c +++ b/examples/COUPLE/plugin/liblammpsplugin.c @@ -117,6 +117,7 @@ liblammpsplugin_t *liblammpsplugin_load(const char *lib) ADDSYM(set_string_variable); ADDSYM(set_internal_variable); ADDSYM(variable_info); + ADDSYM(eval); ADDSYM(gather_atoms); ADDSYM(gather_atoms_concat); diff --git a/examples/COUPLE/plugin/liblammpsplugin.h b/examples/COUPLE/plugin/liblammpsplugin.h index b2df98d630..dd1c9628f5 100644 --- a/examples/COUPLE/plugin/liblammpsplugin.h +++ b/examples/COUPLE/plugin/liblammpsplugin.h @@ -163,6 +163,7 @@ struct _liblammpsplugin { int (*set_string_variable)(void *, const char *, const char *); int (*set_internal_variable)(void *, const char *, double); int (*variable_info)(void *, int, char *, int); + double (*eval)(void *, const char *); void (*gather_atoms)(void *, const char *, int, int, void *); void (*gather_atoms_concat)(void *, const char *, int, int, void *); diff --git a/examples/PACKAGES/imd/deca-ala-solv_imd_v3.py b/examples/PACKAGES/imd/deca-ala-solv_imd_v3.py new file mode 100644 index 0000000000..176e15e811 --- /dev/null +++ b/examples/PACKAGES/imd/deca-ala-solv_imd_v3.py @@ -0,0 +1,13 @@ +""" +For use with 'in.deca-ala-solv_imd_v3'. + +Tested with imdclient v0.1.4 and MDAnalysis v2.8.0 +""" +from imdclient.IMD import IMDReader +import MDAnalysis as mda + +u = mda.Universe('data.deca-ala-solv', "imd://localhost:5678", topology_format='DATA') + +for ts in u.trajectory: + print(ts.time) + print(ts.velocities) \ No newline at end of file diff --git a/examples/PACKAGES/imd/in.deca-ala-solv_imd_v3 b/examples/PACKAGES/imd/in.deca-ala-solv_imd_v3 new file mode 100644 index 0000000000..a7e8244cdc --- /dev/null +++ b/examples/PACKAGES/imd/in.deca-ala-solv_imd_v3 @@ -0,0 +1,31 @@ +# +units real +neighbor 2.5 bin +neigh_modify delay 1 every 1 + +atom_style full +bond_style harmonic +angle_style charmm +dihedral_style charmm +improper_style harmonic + +pair_style lj/charmm/coul/long 8 10 +pair_modify mix arithmetic +special_bonds charmm +read_data data.deca-ala-solv + + +group peptide id <= 103 +fix rigidh all shake 1e-6 100 1000 t 1 2 3 4 5 a 23 + +thermo 100 +thermo_style multi +timestep 2.0 +kspace_style pppm 1e-5 + +fix ensemble all npt temp 300.0 300.0 100.0 iso 1.0 1.0 1000.0 drag 0.2 + +# IMD setup. Client code available in 'deca-ala-solv_imd_v3.py' +fix comm all imd 5678 unwrap on trate 10 version 3 time on box on coordinates on velocities on forces off + +run 5000000 \ No newline at end of file diff --git a/examples/README b/examples/README index 90831b49f0..633fbbc49d 100644 --- a/examples/README +++ b/examples/README @@ -112,6 +112,7 @@ snap: examples for using several bundled SNAP potentials srd: stochastic rotation dynamics (SRD) particles as solvent steinhardt: Steinhardt-Nelson Q_l and W_l parameters usng orientorder/atom streitz: Streitz-Mintmire potential for Al2O3 +stress_vcm: removing binned rigid body motion from binned stress profile tad: temperature-accelerated dynamics of vacancy diffusion in bulk Si template: examples for using atom_style template and comparing to atom style molecular tersoff: regression test input for Tersoff variants diff --git a/examples/stress_vcm/README b/examples/stress_vcm/README new file mode 100644 index 0000000000..aeb065c678 --- /dev/null +++ b/examples/stress_vcm/README @@ -0,0 +1,32 @@ +README stress_vcm +================= + +Contents: + +- in.stress_vcm: Example script showing how to remove binned + velocities of center of mass (VCM) from stress calculations. +- stress_comparison.19Nov24.png: Plot shows the stress + calculated in bars on the y axis for each positional bin on + the x axis. Plotted are three different time steps from + stress profiles with and without the VCM removed. Plot + generated using Python. +- stress_xx.19Nov24.out: Output file generated by fix ave/time. +- log.19Nov24.stress_vcm.g++.1: LAMMPS log file with 1 proc. +- log.19Nov24.stress_vcm.g++.4: LAMMPS log file with 4 procs. + +Notes: + +- Running this script as-is will generate two files. A log + file with thermodynamic data and a stress_xx.out file + containing the binned stress profile with the VCM removed. +- To generate the binned stress profile without removing the + VCM then the compute stress/atom command at step three + needs the last keyword "ch_temp_vcm" to be replaced with + "NULL". +- Uncommenting the line under "Atom dump" will generate an + all atom dump file every 50 time steps containing atom ID, + type, and xyz coordinates. +- Uncommenting the lines under "Image dumps" will generate + .jpg image files every 250 timesteps. +- Uncommenting lines under "Movie dump" will generate a .avi + movie file showing timesteps every 125 timesteps. \ No newline at end of file diff --git a/examples/stress_vcm/in.stress_vcm b/examples/stress_vcm/in.stress_vcm new file mode 100644 index 0000000000..f2ecdb29a7 --- /dev/null +++ b/examples/stress_vcm/in.stress_vcm @@ -0,0 +1,113 @@ +# Removing Binned Velocities of Center of Mass (VCM) from Stress + +# This example shows how to remove rigid body motion from +# binned stress calculations. This uses a combination of commands +# from compute chunk/atom, compute temp/chunk, compute +# stress/atom and fix ave/time. We'll show how these commands +# work in the context of a shockwave experiment on a cube of +# atoms. To shock the cube, a rectangular region of atoms is +# frozen, moved into the cube with a constant velocity along the +# x direction, and then unfrozen. As the shockwave begins +# propagating, the body of the cube also moves along the x +# direction. To better understand the stress dynamics of the +# cube we remove the velocity component belonging to the overall +# motion of each bin. + +units metal +boundary p p p +atom_style atomic +lattice fcc 5.3589 +processors 1 * * + +# Defining regions for box and atoms. +# In this experiment an elongated simulation cell is +# defined in the x direction to allow for non-periodic +# motion of the atoms. + +region box1 block -3 24 0 12 0 12 units lattice +region box2 block 0 12 0 12 0 12 units lattice + +# Creating box and atoms + +create_box 1 box1 +create_atoms 1 region box2 + +mass 1 40.00 + +# Adding energy to the system + +velocity all create 600.0 9999 + +pair_style lj/cut 10 +pair_coeff 1 1 0.04 3.405 + +# Begin time integration + +timestep 2e-3 + +fix fix_nve all nve + +thermo 100 + +run 500 + +#--------------------------------------# +# Chunk, Stress, and VCM removal steps # +#--------------------------------------# + +# 1. Create 20 equispaced bins sliced along the x direction. +# -"units reduced" normalizes the distance from 0.0 to 1.0 +variable nbins index 20 +variable fraction equal 1.0/v_nbins +variable volfrac equal 1/(vol*${fraction}) +compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced + +# 2. Calculate temperature bins with VCM aka COM velocities removed. +compute ch_temp_vcm all temp/chunk ch_id com yes + +# 3. Compute per atom stress with VCM removed via temp-ID. +# -The velocities from specified temp-ID are used to compute stress. +# -Stress/atom units are pressure*volume! Optionally handled next step. +compute atom_stress_vcm all stress/atom ch_temp_vcm + +# 4. Divide out bin volume from xx stress component. +variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) + +# 5. Sum the per atom stresses in each bin. +compute ch_stress_vcm all reduce/chunk ch_id sum v_stress + +# 6. Average and output to file. +# -The average output is every 100 steps with samples collected 20 times with 5 step intervals. +fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out + +#--------------------------------------# + +# Piston compressing along x direction + +region piston block -1 1 INF INF INF INF units lattice +group piston region piston +fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s + +thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve + +# Atom dump + +# dump atom_dump all atom 50 dump.vcm + +# # Image dumps + +# dump 2 all image 250 image.*.jpg type type & +# axes yes 0.8 0.02 view 60 -30 +# dump_modify 2 pad 1 + +# # Movie dump + +# dump 3 all movie 125 movie.avi type type & +# axes yes 0.8 0.02 view 60 -30 +# dump_modify 3 pad 1 + +run 500 + +unfix fix_piston + +run 1500 diff --git a/examples/stress_vcm/log.19Nov24.stress_vcm.g++.1 b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.1 new file mode 100644 index 0000000000..93ae029334 --- /dev/null +++ b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.1 @@ -0,0 +1,253 @@ +LAMMPS (19 Nov 2024) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +# Removing Binned Velocities of Center of Mass (VCM) from Stress + +# This example shows how to remove rigid body motion from +# binned stress calculations. This uses a combination of commands +# from compute chunk/atom, compute temp/chunk, compute +# stress/atom and fix ave/time. We'll show how these commands +# work in the context of a shockwave experiment on a cube of +# atoms. To shock the cube, a rectangular region of atoms is +# frozen, moved into the cube with a constant velocity along the +# x direction, and then unfrozen. As the shockwave begins +# propagating, the body of the cube also moves along the x +# direction. To better understand the stress dynamics of the +# cube we remove the velocity component belonging to the overall +# motion of each bin. + +units metal +boundary p p p +atom_style atomic +lattice fcc 5.3589 +Lattice spacing in x,y,z = 5.3589 5.3589 5.3589 +processors 1 * * + +# Defining regions for box and atoms. +# In this experiment an elongated simulation cell is +# defined in the x direction to allow for non-periodic +# motion of the atoms. + +region box1 block -3 24 0 12 0 12 units lattice +region box2 block 0 12 0 12 0 12 units lattice + +# Creating box and atoms + +create_box 1 box1 +Created orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + 1 by 1 by 1 MPI processor grid +create_atoms 1 region box2 +Created 7200 atoms + using lattice units in orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + create_atoms CPU = 0.002 seconds + +mass 1 40.00 + +# Adding energy to the system + +velocity all create 600.0 9999 + +pair_style lj/cut 10 +pair_coeff 1 1 0.04 3.405 + +# Begin time integration + +timestep 2e-3 + +fix fix_nve all nve + +thermo 100 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 25 11 11 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 5.721 | 5.721 | 5.721 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 600 -2252.7567 0 -1694.4304 -974.62456 + 100 284.72172 -1977.4291 0 -1712.483 2453.7429 + 200 304.44519 -1994.7937 0 -1711.4941 1822.2699 + 300 304.28012 -1993.2958 0 -1710.1498 1498.3794 + 400 296.76492 -1985.1364 0 -1708.9836 1259.9474 + 500 295.00895 -1982.4224 0 -1707.9036 964.9526 +Loop time of 3.01696 on 1 procs for 500 steps with 7200 atoms + +Performance: 28.638 ns/day, 0.838 hours/ns, 165.730 timesteps/s, 1.193 Matom-step/s +99.4% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 2.8439 | 2.8439 | 2.8439 | 0.0 | 94.26 +Neigh | 0.11212 | 0.11212 | 0.11212 | 0.0 | 3.72 +Comm | 0.015585 | 0.015585 | 0.015585 | 0.0 | 0.52 +Output | 0.003747 | 0.003747 | 0.003747 | 0.0 | 0.12 +Modify | 0.026097 | 0.026097 | 0.026097 | 0.0 | 0.87 +Other | | 0.01551 | | | 0.51 + +Nlocal: 7200 ave 7200 max 7200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6410 ave 6410 max 6410 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 615095 ave 615095 max 615095 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 615095 +Ave neighs/atom = 85.429861 +Neighbor list builds = 9 +Dangerous builds = 0 + +#------------------------------------# +# Chunk, Stress, and VCM removal steps +#------------------------------------# + +# 1. Create 20 equispaced bins sliced along the x direction. +# "units reduced" normalizes the distance from 0 to 1 +variable nbins index 20 +variable fraction equal 1.0/v_nbins +variable volfrac equal 1/(vol*${fraction}) +variable volfrac equal 1/(vol*0.05) +compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced +compute ch_id all chunk/atom bin/1d x lower 0.05 units reduced + +# 2. Calculate temperature bins with VCM aka COM velocities removed. +compute ch_temp_vcm all temp/chunk ch_id com yes + +# 3. Compute per atom stress with VCM removed via temp-ID. +# The velocities from specified temp-ID are used to compute stress +# Stress/atom units are pressure*volume! Optionally handled next step. +compute atom_stress_vcm all stress/atom ch_temp_vcm + +# 4. Divide out bin volume from xx stress component. +variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) +variable stress atom -(c_atom_stress_vcm[1])/(vol*0.05) + +# 5. Sum the per atom stresses in each bin. +compute ch_stress_vcm all reduce/chunk ch_id sum v_stress + +# 6. Average and output to file. +# The average output is every 100 steps with samples collected 20 times with 5 step intervals +# fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out + +#------------------------------------# + +# Piston compressing along x direction + +region piston block -1 1 INF INF INF INF units lattice +group piston region piston +863 atoms in group piston +fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s + +thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve + +# Atom dump + +# dump atom_dump all atom 50 dump.vcm + +# # Image dumps + +# dump 2 all image 250 image.*.jpg type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 2 pad 1 + +# # Movie dump + +# dump 3 all movie 125 movie.avi type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 3 pad 1 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +WARNING: One or more atoms are time integrated more than once (src/modify.cpp:296) +Per MPI rank memory allocation (min/avg/max) = 6.975 | 6.975 | 6.975 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 500 295.00895 274.51875 -1982.4224 144.6903 64.3068 64.3068 631.89976 1127.2965 1135.6616 -1707.9036 + 600 357.38902 332.56613 -1951.3422 144.6903 64.3068 64.3068 2236.6706 2003.2726 1943.6815 -1618.7761 + 700 420.30268 391.11005 -1911.8178 144.6903 64.3068 64.3068 3761.5011 3065.4699 3140.3169 -1520.7077 + 800 484.96279 451.27911 -1875.379 144.6903 64.3068 64.3068 5362.254 4174.4201 4166.0818 -1424.0999 + 900 587.78954 546.96391 -1871.217 144.6903 64.3068 64.3068 6481.4714 4875.705 4676.6083 -1324.2531 + 1000 684.07997 636.56636 -1868.1639 144.6903 64.3068 64.3068 7734.6158 5271.3524 5272.1276 -1231.5975 +Loop time of 3.09383 on 1 procs for 500 steps with 7200 atoms + +Performance: 27.927 ns/day, 0.859 hours/ns, 161.612 timesteps/s, 1.164 Matom-step/s +100.0% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 2.8485 | 2.8485 | 2.8485 | 0.0 | 92.07 +Neigh | 0.18767 | 0.18767 | 0.18767 | 0.0 | 6.07 +Comm | 0.011533 | 0.011533 | 0.011533 | 0.0 | 0.37 +Output | 0.003323 | 0.003323 | 0.003323 | 0.0 | 0.11 +Modify | 0.031777 | 0.031777 | 0.031777 | 0.0 | 1.03 +Other | | 0.01107 | | | 0.36 + +Nlocal: 7200 ave 7200 max 7200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6409 ave 6409 max 6409 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 646408 ave 646408 max 646408 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 646408 +Ave neighs/atom = 89.778889 +Neighbor list builds = 15 +Dangerous builds = 0 + +unfix fix_piston + +run 1500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 6.6 | 6.6 | 6.6 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 1000 684.07997 636.56636 -1868.1639 144.6903 64.3068 64.3068 7734.6158 5271.3524 5272.1276 -1231.5975 + 1100 710.19886 660.87113 -1894.0485 144.6903 64.3068 64.3068 8048.3485 5396.6668 5376.5956 -1233.1774 + 1200 717.16487 667.35331 -1901.3849 144.6903 64.3068 64.3068 8009.7984 5634.5121 5349.4113 -1234.0316 + 1300 710.26037 660.92837 -1894.9802 144.6903 64.3068 64.3068 8063.4125 5572.1245 5530.174 -1234.0519 + 1400 715.93921 666.21278 -1898.8885 144.6903 64.3068 64.3068 7752.0927 5293.5463 5322.2312 -1232.6757 + 1500 748.85411 696.84154 -1926.4891 144.6903 64.3068 64.3068 6030.5428 4076.8886 4012.7653 -1229.6475 + 1600 767.98982 714.64815 -1939.8556 144.6903 64.3068 64.3068 4200.3475 2532.5711 2530.5518 -1225.2075 + 1700 757.22042 704.62675 -1925.553 144.6903 64.3068 64.3068 2686.7843 1482.2796 1505.8073 -1220.9262 + 1800 727.30327 676.78754 -1894.6635 144.6903 64.3068 64.3068 1764.2793 781.37451 801.18668 -1217.8759 + 1900 688.82146 640.97853 -1856.5007 144.6903 64.3068 64.3068 1022.805 417.32394 359.74951 -1215.5221 + 2000 655.91228 610.35509 -1823.954 144.6903 64.3068 64.3068 551.98825 -20.148643 -56.976652 -1213.5989 + 2100 620.22468 577.14622 -1789.1761 144.6903 64.3068 64.3068 264.05975 -266.8323 -314.45533 -1212.0299 + 2200 589.13325 548.21428 -1758.9252 144.6903 64.3068 64.3068 41.369707 -533.503 -525.69401 -1210.7109 + 2300 563.20394 524.08593 -1733.6036 144.6903 64.3068 64.3068 -220.99189 -810.90513 -774.65084 -1209.5176 + 2400 540.44236 502.90528 -1711.3384 144.6903 64.3068 64.3068 -358.01508 -962.31635 -977.3253 -1208.4332 + 2500 523.5718 487.20648 -1694.7088 144.6903 64.3068 64.3068 -521.87444 -1152.8386 -1231.7615 -1207.5023 +Loop time of 9.34327 on 1 procs for 1500 steps with 7200 atoms + +Performance: 27.742 ns/day, 0.865 hours/ns, 160.543 timesteps/s, 1.156 Matom-step/s +98.5% CPU use with 1 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 8.4692 | 8.4692 | 8.4692 | 0.0 | 90.65 +Neigh | 0.7512 | 0.7512 | 0.7512 | 0.0 | 8.04 +Comm | 0.031189 | 0.031189 | 0.031189 | 0.0 | 0.33 +Output | 0.010584 | 0.010584 | 0.010584 | 0.0 | 0.11 +Modify | 0.053052 | 0.053052 | 0.053052 | 0.0 | 0.57 +Other | | 0.02803 | | | 0.30 + +Nlocal: 7200 ave 7200 max 7200 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Nghost: 6380 ave 6380 max 6380 min +Histogram: 1 0 0 0 0 0 0 0 0 0 +Neighs: 515773 ave 515773 max 515773 min +Histogram: 1 0 0 0 0 0 0 0 0 0 + +Total # of neighbors = 515773 +Ave neighs/atom = 71.635139 +Neighbor list builds = 57 +Dangerous builds = 0 +Total wall time: 0:00:15 diff --git a/examples/stress_vcm/log.19Nov24.stress_vcm.g++.4 b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.4 new file mode 100644 index 0000000000..d8e7c07536 --- /dev/null +++ b/examples/stress_vcm/log.19Nov24.stress_vcm.g++.4 @@ -0,0 +1,253 @@ +LAMMPS (19 Nov 2024) +OMP_NUM_THREADS environment is not set. Defaulting to 1 thread. (src/comm.cpp:99) + using 1 OpenMP thread(s) per MPI task +# Removing Binned Velocities of Center of Mass (VCM) from Stress + +# This example shows how to remove rigid body motion from +# binned stress calculations. This uses a combination of commands +# from compute chunk/atom, compute temp/chunk, compute +# stress/atom and fix ave/time. We'll show how these commands +# work in the context of a shockwave experiment on a cube of +# atoms. To shock the cube, a rectangular region of atoms is +# frozen, moved into the cube with a constant velocity along the +# x direction, and then unfrozen. As the shockwave begins +# propagating, the body of the cube also moves along the x +# direction. To better understand the stress dynamics of the +# cube we remove the velocity component belonging to the overall +# motion of each bin. + +units metal +boundary p p p +atom_style atomic +lattice fcc 5.3589 +Lattice spacing in x,y,z = 5.3589 5.3589 5.3589 +processors 1 * * + +# Defining regions for box and atoms. +# In this experiment an elongated simulation cell is +# defined in the x direction to allow for non-periodic +# motion of the atoms. + +region box1 block -3 24 0 12 0 12 units lattice +region box2 block 0 12 0 12 0 12 units lattice + +# Creating box and atoms + +create_box 1 box1 +Created orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + 1 by 2 by 2 MPI processor grid +create_atoms 1 region box2 +Created 7200 atoms + using lattice units in orthogonal box = (-16.0767 0 0) to (128.6136 64.3068 64.3068) + create_atoms CPU = 0.001 seconds + +mass 1 40.00 + +# Adding energy to the system + +velocity all create 600.0 9999 + +pair_style lj/cut 10 +pair_coeff 1 1 0.04 3.405 + +# Begin time integration + +timestep 2e-3 + +fix fix_nve all nve + +thermo 100 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Neighbor list info ... + update: every = 1 steps, delay = 0 steps, check = yes + max neighbors/atom: 2000, page size: 100000 + master list distance cutoff = 12 + ghost atom cutoff = 12 + binsize = 6, bins = 25 11 11 + 1 neighbor lists, perpetual/occasional/extra = 1 0 0 + (1) pair lj/cut/opt, perpetual + attributes: half, newton on + pair build: half/bin/atomonly/newton + stencil: half/bin/3d + bin: standard +Per MPI rank memory allocation (min/avg/max) = 3.662 | 3.662 | 3.662 Mbytes + Step Temp E_pair E_mol TotEng Press + 0 600 -2252.7567 0 -1694.4304 -974.62456 + 100 284.1896 -1976.961 0 -1712.5101 2462.6396 + 200 308.58965 -1998.6349 0 -1711.4787 1789.0033 + 300 300.55093 -1989.9838 0 -1710.308 1545.8576 + 400 297.91491 -1986.2519 0 -1709.029 1247.7121 + 500 294.66041 -1982.1097 0 -1707.9153 961.03073 +Loop time of 0.942408 on 4 procs for 500 steps with 7200 atoms + +Performance: 91.680 ns/day, 0.262 hours/ns, 530.556 timesteps/s, 3.820 Matom-step/s +82.1% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.61287 | 0.63781 | 0.65858 | 2.1 | 67.68 +Neigh | 0.030246 | 0.031529 | 0.034546 | 1.0 | 3.35 +Comm | 0.23074 | 0.25145 | 0.27819 | 3.7 | 26.68 +Output | 0.000282 | 0.0003735 | 0.000463 | 0.0 | 0.04 +Modify | 0.005566 | 0.0057635 | 0.005989 | 0.2 | 0.61 +Other | | 0.01548 | | | 1.64 + +Nlocal: 1800 ave 1814 max 1787 min +Histogram: 1 0 1 0 0 0 0 1 0 1 +Nghost: 3713.5 ave 3727 max 3699 min +Histogram: 1 0 1 0 0 0 0 1 0 1 +Neighs: 153532 ave 154995 max 152312 min +Histogram: 1 0 1 0 0 1 0 0 0 1 + +Total # of neighbors = 614128 +Ave neighs/atom = 85.295556 +Neighbor list builds = 9 +Dangerous builds = 0 + +#------------------------------------# +# Chunk, Stress, and VCM removal steps +#------------------------------------# + +# 1. Create 20 equispaced bins sliced along the x direction. +# "units reduced" normalizes the distance from 0 to 1 +variable nbins index 20 +variable fraction equal 1.0/v_nbins +variable volfrac equal 1/(vol*${fraction}) +variable volfrac equal 1/(vol*0.05) +compute ch_id all chunk/atom bin/1d x lower ${fraction} units reduced +compute ch_id all chunk/atom bin/1d x lower 0.05 units reduced + +# 2. Calculate temperature bins with VCM aka COM velocities removed. +compute ch_temp_vcm all temp/chunk ch_id com yes + +# 3. Compute per atom stress with VCM removed via temp-ID. +# The velocities from specified temp-ID are used to compute stress +# Stress/atom units are pressure*volume! Optionally handled next step. +compute atom_stress_vcm all stress/atom ch_temp_vcm + +# 4. Divide out bin volume from xx stress component. +variable stress atom -(c_atom_stress_vcm[1])/(vol*${fraction}) +variable stress atom -(c_atom_stress_vcm[1])/(vol*0.05) + +# 5. Sum the per atom stresses in each bin. +compute ch_stress_vcm all reduce/chunk ch_id sum v_stress + +# 6. Average and output to file. +# The average output is every 100 steps with samples collected 20 times with 5 step intervals +# fix ave_stress_vcm all ave/time 5 20 100 c_ch_stress_vcm mode vector file stress_xx.out + +#------------------------------------# + +# Piston compressing along x direction + +region piston block -1 1 INF INF INF INF units lattice +group piston region piston +864 atoms in group piston +fix fix_piston piston move linear 5 0 0 units box # strain rate ~ 8e10 1/s + +thermo_style custom step temp ke pe lx ly lz pxx pyy pzz econserve + +# Atom dump + +# dump atom_dump all atom 50 dump.vcm + +# # Image dumps + +# dump 2 all image 250 image.*.jpg type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 2 pad 1 + +# # Movie dump + +# dump 3 all movie 125 movie.avi type type # axes yes 0.8 0.02 view 60 -30 +# dump_modify 3 pad 1 + +run 500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +WARNING: One or more atoms are time integrated more than once (src/modify.cpp:296) +Per MPI rank memory allocation (min/avg/max) = 4.916 | 4.916 | 4.916 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 500 294.66041 274.19441 -1982.1097 144.6903 64.3068 64.3068 645.25795 1119.5337 1118.3006 -1707.9153 + 600 357.88641 333.02897 -1951.8158 144.6903 64.3068 64.3068 2176.0343 1929.2787 1981.8479 -1618.7869 + 700 418.41159 389.3503 -1912.8337 144.6903 64.3068 64.3068 3702.2875 3043.7607 3081.1607 -1523.4834 + 800 483.71102 450.11428 -1875.7955 144.6903 64.3068 64.3068 5254.3875 4190.9789 4158.3561 -1425.6813 + 900 586.0893 545.38176 -1870.9313 144.6903 64.3068 64.3068 6509.1439 4756.2216 4724.7086 -1325.5495 + 1000 686.32946 638.65962 -1874.811 144.6903 64.3068 64.3068 7515.1606 5193.049 5261.8688 -1236.1514 +Loop time of 0.656417 on 4 procs for 500 steps with 7200 atoms + +Performance: 131.624 ns/day, 0.182 hours/ns, 761.711 timesteps/s, 5.484 Matom-step/s +92.8% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 0.51672 | 0.52334 | 0.53259 | 0.8 | 79.73 +Neigh | 0.045091 | 0.045915 | 0.047402 | 0.4 | 6.99 +Comm | 0.060735 | 0.071794 | 0.079302 | 2.6 | 10.94 +Output | 0.000208 | 0.000389 | 0.000926 | 0.0 | 0.06 +Modify | 0.006007 | 0.0061595 | 0.00626 | 0.1 | 0.94 +Other | | 0.008815 | | | 1.34 + +Nlocal: 1800 ave 1811 max 1785 min +Histogram: 1 0 0 1 0 0 0 0 0 2 +Nghost: 3713.25 ave 3727 max 3702 min +Histogram: 2 0 0 0 0 0 0 1 0 1 +Neighs: 161477 ave 162958 max 159732 min +Histogram: 1 0 0 0 1 0 0 1 0 1 + +Total # of neighbors = 645909 +Ave neighs/atom = 89.709583 +Neighbor list builds = 15 +Dangerous builds = 0 + +unfix fix_piston + +run 1500 +Generated 0 of 0 mixed pair_coeff terms from geometric mixing rule +Per MPI rank memory allocation (min/avg/max) = 4.541 | 4.541 | 4.541 Mbytes + Step Temp KinEng PotEng Lx Ly Lz Pxx Pyy Pzz Econserve + 1000 686.32946 638.65962 -1874.811 144.6903 64.3068 64.3068 7515.1606 5193.049 5261.8688 -1236.1514 + 1100 709.7333 660.43791 -1898.2844 144.6903 64.3068 64.3068 7932.8638 5334.6171 5364.5335 -1237.8465 + 1200 713.27253 663.73132 -1902.4588 144.6903 64.3068 64.3068 7957.2574 5500.6231 5538.0516 -1238.7275 + 1300 705.44796 656.45022 -1895.1575 144.6903 64.3068 64.3068 7996.7265 5584.6233 5538.2494 -1238.7072 + 1400 711.86463 662.42121 -1899.8416 144.6903 64.3068 64.3068 7674.2462 5292.4915 5294.5366 -1237.4204 + 1500 742.18946 690.63979 -1924.9562 144.6903 64.3068 64.3068 6047.915 4056.6156 4014.4446 -1234.3164 + 1600 762.81764 709.83522 -1939.8563 144.6903 64.3068 64.3068 4185.5873 2530.0572 2576.1943 -1230.0211 + 1700 754.40428 702.00621 -1927.7337 144.6903 64.3068 64.3068 2662.7604 1509.1985 1484.7252 -1225.7275 + 1800 721.03504 670.95468 -1893.5556 144.6903 64.3068 64.3068 1765.8783 835.89765 861.9432 -1222.6009 + 1900 689.64162 641.74172 -1861.8886 144.6903 64.3068 64.3068 941.58148 312.93205 409.79901 -1220.1469 + 2000 650.79664 605.59477 -1823.9889 144.6903 64.3068 64.3068 543.39234 28.48735 80.396505 -1218.3941 + 2100 616.04072 573.25286 -1790.1764 144.6903 64.3068 64.3068 308.16444 -235.20997 -248.22531 -1216.9235 + 2200 587.18712 546.40333 -1761.8878 144.6903 64.3068 64.3068 37.044801 -476.50396 -470.83059 -1215.4845 + 2300 562.84178 523.74892 -1738.2239 144.6903 64.3068 64.3068 -139.28348 -711.17273 -730.80877 -1214.475 + 2400 540.48362 502.94367 -1716.3529 144.6903 64.3068 64.3068 -320.98222 -951.2066 -943.93966 -1213.4093 + 2500 519.80431 483.70067 -1696.1896 144.6903 64.3068 64.3068 -471.61317 -1088.8457 -1131.5396 -1212.4889 +Loop time of 1.97213 on 4 procs for 1500 steps with 7200 atoms + +Performance: 131.431 ns/day, 0.183 hours/ns, 760.598 timesteps/s, 5.476 Matom-step/s +95.3% CPU use with 4 MPI tasks x 1 OpenMP threads + +MPI task timing breakdown: +Section | min time | avg time | max time |%varavg| %total +--------------------------------------------------------------- +Pair | 1.5455 | 1.5599 | 1.5723 | 0.8 | 79.10 +Neigh | 0.16844 | 0.1704 | 0.17237 | 0.4 | 8.64 +Comm | 0.19002 | 0.2047 | 0.22068 | 2.4 | 10.38 +Output | 0.000525 | 0.0006785 | 0.001077 | 0.0 | 0.03 +Modify | 0.012434 | 0.012601 | 0.012777 | 0.1 | 0.64 +Other | | 0.02388 | | | 1.21 + +Nlocal: 1800 ave 1833 max 1776 min +Histogram: 1 0 1 0 1 0 0 0 0 1 +Nghost: 3702 ave 3732 max 3674 min +Histogram: 1 0 0 1 0 0 1 0 0 1 +Neighs: 129380 ave 132578 max 127003 min +Histogram: 1 0 0 2 0 0 0 0 0 1 + +Total # of neighbors = 517520 +Ave neighs/atom = 71.877778 +Neighbor list builds = 54 +Dangerous builds = 0 +Total wall time: 0:00:03 diff --git a/examples/stress_vcm/stress_comparison.19Nov24.png b/examples/stress_vcm/stress_comparison.19Nov24.png new file mode 100644 index 0000000000..c37a3ba345 Binary files /dev/null and b/examples/stress_vcm/stress_comparison.19Nov24.png differ diff --git a/examples/stress_vcm/stress_xx.19Nov24.out b/examples/stress_vcm/stress_xx.19Nov24.out new file mode 100644 index 0000000000..6e65250a96 --- /dev/null +++ b/examples/stress_vcm/stress_xx.19Nov24.out @@ -0,0 +1,423 @@ +# Time-averaged data for fix ave_stress_vcm +# TimeStep Number-of-rows +# Row c_ch_stress_vcm +600 20 +1 0 +2 -142.965 +3 2142.79 +4 12968.3 +5 -336.7 +6 2638.09 +7 4214.83 +8 3187.61 +9 -488.891 +10 -49.3553 +11 151.373 +12 -317.663 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +700 20 +1 0 +2 -14.3195 +3 -1238.9 +4 30664.3 +5 18805.2 +6 498.562 +7 930.874 +8 660.655 +9 -266.903 +10 -317.877 +11 -386.989 +12 -304.697 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +800 20 +1 0 +2 0 +3 -1656.7 +4 30424.3 +5 37003.5 +6 15562.5 +7 -2441.9 +8 -1766.09 +9 272.718 +10 -664.774 +11 -72.6933 +12 -469.765 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +900 20 +1 0 +2 0 +3 -1567.21 +4 24987.6 +5 38068.9 +6 31595 +7 8864.94 +8 -3423.99 +9 -753.063 +10 125.21 +11 -50.4895 +12 -172.14 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1000 20 +1 0 +2 0 +3 -893.168 +4 15591.6 +5 32690.6 +6 30183 +7 27172 +8 9459.75 +9 -1416.35 +10 -432.731 +11 444.323 +12 -424.357 +13 0 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1100 20 +1 0 +2 0 +3 -601.805 +4 8890.79 +5 23345.1 +6 28529.2 +7 29111.9 +8 25846.2 +9 7451.83 +10 -1624.2 +11 320.704 +12 -50.9865 +13 -5.50481 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1200 20 +1 0 +2 0 +3 1435.39 +4 8818.29 +5 7129.61 +6 20281.7 +7 28026.1 +8 28327.7 +9 26918.6 +10 8277.12 +11 -249.644 +12 -171.806 +13 -7.19065 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1300 20 +1 0 +2 0 +3 -718.118 +4 3021.9 +5 9010.51 +6 9500.87 +7 19432.8 +8 27254.3 +9 28638.5 +10 25568.5 +11 8094.66 +12 -368.293 +13 -2.20997 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1400 20 +1 0 +2 0 +3 -650.581 +4 190.19 +5 5465.38 +6 7489.23 +7 7575.16 +8 18433.5 +9 26975.3 +10 28981.5 +11 26987.9 +12 7502.07 +13 0.117312 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1500 20 +1 0 +2 0 +3 -619.311 +4 561.257 +5 461.5 +6 4105.68 +7 9272.68 +8 10445.6 +9 18826.1 +10 25434.8 +11 25653.8 +12 10981.2 +13 33.682 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1600 20 +1 0 +2 0 +3 -349.345 +4 513.579 +5 -471.384 +6 1257.81 +7 7122.9 +8 8659.35 +9 8452.08 +10 16013.5 +11 17091 +12 5476.24 +13 -136.183 +14 0 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1700 20 +1 0 +2 0 +3 -273.839 +4 -907.407 +5 -272.136 +6 594.363 +7 3302.77 +8 5564.07 +9 8689.92 +10 6446.06 +11 1779.37 +12 338.998 +13 -171.408 +14 -1.21548 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1800 20 +1 0 +2 0 +3 -164.819 +4 383.877 +5 -140.681 +6 -10.0153 +7 907.937 +8 3269.05 +9 5325.22 +10 395.73 +11 -4103.73 +12 -2787.16 +13 -1357.04 +14 -35.2044 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +1900 20 +1 0 +2 0 +3 -80.813 +4 334.225 +5 248.55 +6 82.0566 +7 207.763 +8 185.714 +9 -55.8635 +10 -2758.51 +11 -4619.33 +12 -5521.92 +13 -2346.36 +14 -415.324 +15 0 +16 0 +17 0 +18 0 +19 0 +20 0 +2000 20 +1 0 +2 0 +3 -83.1832 +4 264.023 +5 596.087 +6 40.8157 +7 -267.093 +8 -2288.15 +9 -3387.64 +10 -5566.79 +11 -5640.76 +12 -4925.74 +13 -3096.01 +14 -757.817 +15 -1.13042 +16 0 +17 0 +18 0 +19 0 +20 0 +2100 20 +1 0 +2 0 +3 -17.4378 +4 62.1251 +5 740.988 +6 357.467 +7 -1137.61 +8 -4266.83 +9 -4962.9 +10 -5322.45 +11 -5437.58 +12 -4846.56 +13 -3651.28 +14 -1151.01 +15 -28.3074 +16 0 +17 0 +18 0 +19 0 +20 0 +2200 20 +1 0 +2 0 +3 -10.8779 +4 -56.7926 +5 400.261 +6 -568.63 +7 -2193.36 +8 -3856.71 +9 -6603 +10 -5717.11 +11 -4868.64 +12 -4173.5 +13 -3402.64 +14 -1712.44 +15 -80.6771 +16 -0.123189 +17 0 +18 0 +19 0 +20 0 +2300 20 +1 0 +2 0 +3 -22.8402 +4 -44.5496 +5 -365.476 +6 -1285.6 +7 -2887.76 +8 -4022.77 +9 -6280.86 +10 -6055.26 +11 -4921.51 +12 -4445.37 +13 -3531.69 +14 -1360.49 +15 -258.99 +16 0.196931 +17 0 +18 0 +19 0 +20 0 +2400 20 +1 0 +2 0 +3 -0.594396 +4 -148.921 +5 -1118.18 +6 -2071.85 +7 -3989.41 +8 -4567.01 +9 -4939.36 +10 -5170.94 +11 -4922.25 +12 -4587.5 +13 -3748.19 +14 -1785.46 +15 -460.491 +16 2.54038 +17 0 +18 0 +19 0 +20 0 +2500 20 +1 0 +2 0 +3 5.64755 +4 -485.854 +5 -2525.68 +6 -2642.35 +7 -5066.15 +8 -4546.03 +9 -4429.45 +10 -4579.15 +11 -4829.56 +12 -4384.77 +13 -3525.99 +14 -1708.9 +15 -627.176 +16 -23.5581 +17 0 +18 0 +19 0 +20 0 diff --git a/fortran/lammps.f90 b/fortran/lammps.f90 index 1cc6992c27..552b3dfad3 100644 --- a/fortran/lammps.f90 +++ b/fortran/lammps.f90 @@ -126,6 +126,7 @@ MODULE LIBLAMMPS PROCEDURE :: set_variable => lmp_set_variable PROCEDURE :: set_string_variable => lmp_set_string_variable PROCEDURE :: set_internal_variable => lmp_set_internal_variable + PROCEDURE :: eval => lmp_eval PROCEDURE, PRIVATE :: lmp_gather_atoms_int PROCEDURE, PRIVATE :: lmp_gather_atoms_double GENERIC :: gather_atoms => lmp_gather_atoms_int, & @@ -618,7 +619,14 @@ MODULE LIBLAMMPS INTEGER(c_int) :: lammps_set_internal_variable END FUNCTION lammps_set_internal_variable - SUBROUTINE lammps_gather_atoms(handle, name, type, count, data) BIND(C) + FUNCTION lammps_eval(handle, expr) BIND(C) + IMPORT :: c_ptr, c_double + IMPLICIT NONE + TYPE(c_ptr), VALUE :: handle, expr + REAL(c_double) :: lammps_eval + END FUNCTION lammps_eval + + SUBROUTINE lammps_gather_atoms(handle, name, TYPE, count, DATA) BIND(C) IMPORT :: c_int, c_ptr IMPLICIT NONE TYPE(c_ptr), VALUE :: handle, name, data @@ -1812,7 +1820,7 @@ CONTAINS SUBROUTINE lmp_set_internal_variable(self, name, val) CLASS(lammps), INTENT(IN) :: self CHARACTER(LEN=*), INTENT(IN) :: name - REAL(KIND=c_double), INTENT(IN) :: val + REAL(c_double), INTENT(IN) :: val INTEGER :: err TYPE(c_ptr) :: Cname @@ -1826,6 +1834,18 @@ CONTAINS END IF END SUBROUTINE lmp_set_internal_variable + ! equivalent function to lammps_eval + FUNCTION lmp_eval(self, expr) + CLASS(lammps), INTENT(IN) :: self + CHARACTER(LEN=*), INTENT(IN) :: expr + REAL(c_double) :: lmp_eval + TYPE(c_ptr) :: Cexpr + + Cexpr = f2c_string(expr) + lmp_eval = lammps_eval(self%handle, Cexpr) + CALL lammps_free(Cexpr) + END FUNCTION lmp_eval + ! equivalent function to lammps_gather_atoms (for integers) SUBROUTINE lmp_gather_atoms_int(self, name, count, data) CLASS(lammps), INTENT(IN) :: self diff --git a/lib/kokkos/CHANGELOG.md b/lib/kokkos/CHANGELOG.md index 7b1d69e566..84bbd03585 100644 --- a/lib/kokkos/CHANGELOG.md +++ b/lib/kokkos/CHANGELOG.md @@ -1,7 +1,112 @@ # CHANGELOG +## 4.5.01 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.5.00...4.5.01) + +### Bug Fixes + +* Fix re-builds after cleaning the binary tree when doing `add_subdirectory` on the Kokkos source [\#7557](https://github.com/kokkos/kokkos/pull/7557) +* Update mdspan to include fix for submdspan and bracket operator with clang 15&16 [\#7559](https://github.com/kokkos/kokkos/pull/7559) +* Fix DynRankView performance regression by re-introducing shortcut operator() impls [\#7606](https://github.com/kokkos/kokkos/pull/7606) +* Add missing MI300A (`GFX942_APU`) option to Makefile build-system + +## 4.5.00 + +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.01...4.5.00) + +### Features + +* SYCL backend graduated to production ready +* Introduce new `SequentialHostInit` view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) (backported in 4.4.01) +* Support building with Run-Time Type Information (RTTI) disabled +* Add new `KOKKOS_RELOCATABLE_FUNCTION` function annotation macro [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +### Backend and Architecture Enhancements + +#### CUDA + +* Adding occupancy tuning for CUDA architectures [\#6788](https://github.com/kokkos/kokkos/pull/6788) +* By default disable `cudaMallocAsync` (i.e., revert the change made in version 4.2) [\#7353](https://github.com/kokkos/kokkos/pull/7353) + +#### HIP + +* Add support for AMD Phoenix APUs with Radeon 740M/760M/780M/880M/890M [\#7162](https://github.com/kokkos/kokkos/pull/7162) +* Update maximum waves per CU values for consumer card [\#7347](https://github.com/kokkos/kokkos/pull/7347) +* Check that Kokkos is running on the architecture it was compiled for [\#7379](https://github.com/kokkos/kokkos/pull/7379) +* Add opt-in option to use `hipMallocAsync` instead of `hipMalloc` [\#7324](https://github.com/kokkos/kokkos/pull/7324) +* Introduce new architecture option `AMD_GFX942_APU` for MI300A [\#7462](https://github.com/kokkos/kokkos/pull/7462) + +#### SYCL + +* Move the `SYCL` backend out of the `Experimental` namespace [\#7171](https://github.com/kokkos/kokkos/pull/7171) +* Introduce `KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE` as CMake option [\#5993](https://github.com/kokkos/kokkos/pull/5993) + +#### OpenACC + +* Add support for building with the Clacc compiler [\#7198](https://github.com/kokkos/kokkos/pull/7198) +* Workaround NVHPC collapse clause bug for `MDRangePolicy` [\#7425](https://github.com/kokkos/kokkos/pull/7425) + +#### HPX + +* Implement `Experimental::partition_space` to produce truly independent execution spaces [\#7287](https://github.com/kokkos/kokkos/pull/7287) + +#### Threads + +* Fix compilation for `parallel_reduce` `MDRange` with `Dynamic` scheduling [\#7478](https://github.com/kokkos/kokkos/pull/7478) +* Fix race conditions on ARM architectures [\#7498](https://github.com/kokkos/kokkos/pull/7498) + +#### OpenMP + +* Fix run time behavior when compiling with `-fvisibility-hidden` [\#7284](https://github.com/kokkos/kokkos/pull/7284) (backported in 4.4.01) +* Fix linking with Cray Clang compiler [\#7341](https://github.com/kokkos/kokkos/pull/7341) + +#### Serial + +* Allow `Kokkos_ENABLE_ATOMICS_BYPASS` to skip mutexes to remediate performance regression in 4.4 [\#7369](https://github.com/kokkos/kokkos/pull/7369) + +### General Enhancements + +* Improve `View` initialization/destruction for non-scalar trivial and trivially-destructible types [\#7219](https://github.com/kokkos/kokkos/pull/7219) [\#7225](https://github.com/kokkos/kokkos/pull/7225) +* Add getters for default tile sizes used in `MDRangePolicy` [\#6839](https://github.com/kokkos/kokkos/pull/6839) +* Improve performance of `Kokkos::sort` when `std::sort` is used [\#7264](https://github.com/kokkos/kokkos/pull/7264) +* Add range-based for loop support for `Array` [\#7293](https://github.com/kokkos/kokkos/pull/7293) +* Allow functors as reducers for nested team parallel reduce [\#6921](https://github.com/kokkos/kokkos/pull/6921) +* Avoid making copies of string rvalue reference arguments to `view_alloc()` [\#7364](https://github.com/kokkos/kokkos/pull/7364) +* Add `atomic_{mod,xor,nand,lshift,rshift}` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Allow using `SequentialHostInit` with `Kokkos::DualView` [\#7456](https://github.com/kokkos/kokkos/pull/7456) +* Add `Graph::instantiate()` [\#7240](https://github.com/kokkos/kokkos/pull/7240) +* Allow an arbitrary execution space instance to be used in `Kokkos::Graph::submit()` [\#7249](https://github.com/kokkos/kokkos/pull/7249) +* Enable compile-time diagnostic of illegal reduction target for graphs [\#7460](https://github.com/kokkos/kokkos/pull/7460) + +### Build System Changes + +* Make sure backend-specific options such as `IMPL_CUDA_MALLOC_ASYNC` only show when that backend is actually enabled [\#7228](https://github.com/kokkos/kokkos/pull/7228) +* Major refactoring removing `TriBITS` paths [\#6164](https://github.com/kokkos/kokkos/pull/6164) +* Add support for SpacemiT K60 (RISC-V) [\#7160](https://github.com/kokkos/kokkos/pull/7160) + +### Deprecations + +* Deprecate Tasking interface [\#7393](https://github.com/kokkos/kokkos/pull/7393) +* Deprecate `atomic_query_version`, `atomic_assign`, `atomic_compare_exchange_strong`, `atomic_{inc, dec}rement` [\#7458](https://github.com/kokkos/kokkos/pull/7458) +* Deprecate `{OpenMP,HPX}::is_asynchronous()` [\#7322](https://github.com/kokkos/kokkos/pull/7322) + +### Bug Fixes + +* Fix undefined behavior in `BinSort` when sorting within bins on host [\#7223](https://github.com/kokkos/kokkos/pull/7223) +* Using CUDA limits to set extents for blocks, grids [\#7235](https://github.com/kokkos/kokkos/pull/7235) +* Fix `deep_copy (serial_exec, dst, src)` with multiple host backends [\#7245](https://github.com/kokkos/kokkos/pull/7245) +* Skip `RangePolicy` bounds conversion checks if roundtrip convertibility is not provided [\#7172](https://github.com/kokkos/kokkos/pull/7172) +* Allow extracting host and device views from `DualView` with `const` value type [\#7242](https://github.com/kokkos/kokkos/pull/7242) +* Fix `TeamPolicy` array reduction for CUDA and HIP [\#6296](https://github.com/kokkos/kokkos/pull/6296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix configuring without architecture flags for SYCL [\#7303](https://github.com/kokkos/kokkos/pull/7303) +* Set an initial value index during join of `MinLoc`, `MaxLoc` or `MinMaxLoc` [\#7330](https://github.com/kokkos/kokkos/pull/7330) +* Fix storage lifetime of driver for global launch of graph nodes for CUDA and HIP [\#7365](https://github.com/kokkos/kokkos/pull/7365) +* Make `value_type` for `RandomAccessIterator` non-`const` [\#7485](https://github.com/kokkos/kokkos/pull/7485) + ## [4.4.01](https://github.com/kokkos/kokkos/tree/4.4.01) -[Full Changelog](https://github.com/kokkos/kokkos/compare/4.0.00...4.4.01) +[Full Changelog](https://github.com/kokkos/kokkos/compare/4.4.00...4.4.01) ### Features: * Introduce new SequentialHostInit view allocation property [\#7229](https://github.com/kokkos/kokkos/pull/7229) @@ -13,7 +118,7 @@ ### Bug Fixes * OpenMP: Fix issue related to the visibility of an internal symbol with shared libraries that affected `ScatterView` in particular [\#7284](https://github.com/kokkos/kokkos/pull/7284) -* Fix implicit copy assignment operators in few AVX2 masks being deleted [#7296](https://github.com/kokkos/kokkos/pull/7296) +* Fix implicit copy assignment operators in few AVX2 masks being deleted [\#7296](https://github.com/kokkos/kokkos/pull/7296) ## [4.4.00](https://github.com/kokkos/kokkos/tree/4.4.00) [Full Changelog](https://github.com/kokkos/kokkos/compare/4.3.01...4.4.00) @@ -57,6 +162,7 @@ * SIMD: Allow flexible vector width for 32 bit types [\#6802](https://github.com/kokkos/kokkos/pull/6802) * Updates for `Kokkos::Array`: add `kokkos_swap(Array)` specialization [\#6943](https://github.com/kokkos/kokkos/pull/6943), add `Kokkos::to_array` [\#6375](https://github.com/kokkos/kokkos/pull/6375), make `Kokkos::Array` equality-comparable [\#7148](https://github.com/kokkos/kokkos/pull/7148) * Structured binding support for `Kokkos::complex` [\#7040](https://github.com/kokkos/kokkos/pull/7040) +* Introduce `KOKKOS_DEDUCTION_GUIDE` macro to allow for portable user-defined deduction guides [\#6954](https://github.com/kokkos/kokkos/pull/6954) ### Build System Changes * Do not require OpenMP support for languages other than CXX [\#6965](https://github.com/kokkos/kokkos/pull/6965) @@ -1388,7 +1494,7 @@ **Closed issues:** - Silent error (Validate storage level arg to set_scratch_size) [\#3097](https://github.com/kokkos/kokkos/issues/3097) -- Remove KOKKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) +- Remove KOKKOS\_ENABLE\_PROFILING Option [\#3095](https://github.com/kokkos/kokkos/issues/3095) - Cuda 11 -\> allow C++17 [\#3083](https://github.com/kokkos/kokkos/issues/3083) - In source build failure not explained [\#3081](https://github.com/kokkos/kokkos/issues/3081) - Allow naming of Views for initialization kernel [\#3070](https://github.com/kokkos/kokkos/issues/3070) diff --git a/lib/kokkos/CMakeLists.txt b/lib/kokkos/CMakeLists.txt index 736cbac218..6a70bea149 100644 --- a/lib/kokkos/CMakeLists.txt +++ b/lib/kokkos/CMakeLists.txt @@ -1,12 +1,11 @@ cmake_minimum_required(VERSION 3.16 FATAL_ERROR) # Disable in-source builds to prevent source tree corruption. -if( "${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}" ) - message( FATAL_ERROR "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." ) -endif() - -if (COMMAND TRIBITS_PACKAGE) - TRIBITS_PACKAGE(Kokkos) +if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_BINARY_DIR}") + message( + FATAL_ERROR + "FATAL: In-source builds are not allowed. You should create a separate directory for build files and delete CMakeCache.txt." + ) endif() # We want to determine if options are given with the wrong case @@ -15,142 +14,141 @@ endif() # form a list of all the given variables. If it begins with any # case of KoKkOS, we add it to the list. -GET_CMAKE_PROPERTY(_variableNames VARIABLES) -SET(KOKKOS_GIVEN_VARIABLES) -FOREACH (var ${_variableNames}) - STRING(TOUPPER ${var} UC_VAR) - STRING(FIND ${UC_VAR} KOKKOS IDX) - IF (${IDX} EQUAL 0) - LIST(APPEND KOKKOS_GIVEN_VARIABLES ${var}) - ENDIF() -ENDFOREACH() +get_cmake_property(_variableNames VARIABLES) +set(KOKKOS_GIVEN_VARIABLES) +foreach(var ${_variableNames}) + string(TOUPPER ${var} UC_VAR) + string(FIND ${UC_VAR} KOKKOS IDX) + if(${IDX} EQUAL 0) + list(APPEND KOKKOS_GIVEN_VARIABLES ${var}) + endif() +endforeach() # Basic initialization (Used in KOKKOS_SETTINGS) -SET(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) -SET(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) -SET(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) +set(Kokkos_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) +set(KOKKOS_SRC_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_PATH ${Kokkos_SOURCE_DIR}) +set(KOKKOS_TOP_BUILD_DIR ${CMAKE_CURRENT_BINARY_DIR}) + +set(PACKAGE_NAME Kokkos) +set(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -# Is this a build as part of Trilinos? -IF(COMMAND TRIBITS_PACKAGE_DECL) - SET(KOKKOS_HAS_TRILINOS ON) -ELSE() - SET(KOKKOS_HAS_TRILINOS OFF) - SET(PACKAGE_NAME Kokkos) - SET(PACKAGE_SOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}") -ENDIF() # Is this build a subdirectory of another project -GET_DIRECTORY_PROPERTY(HAS_PARENT PARENT_DIRECTORY) +get_directory_property(HAS_PARENT PARENT_DIRECTORY) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_functions.cmake) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_pick_cxx_std.cmake) - -SET(KOKKOS_ENABLED_OPTIONS) #exported in config file -SET(KOKKOS_ENABLED_DEVICES) #exported in config file -SET(KOKKOS_ENABLED_TPLS) #exported in config file -SET(KOKKOS_ENABLED_ARCH_LIST) #exported in config file +set(KOKKOS_ENABLED_OPTIONS) #exported in config file +set(KOKKOS_ENABLED_DEVICES) #exported in config file +set(KOKKOS_ENABLED_TPLS) #exported in config file +set(KOKKOS_ENABLED_ARCH_LIST) #exported in config file #These are helper flags used for sanity checks during config #Certain features should depend on other features being configured first -SET(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies -SET(KOKKOS_CFG_DAG_DEVICES_DONE Off) -SET(KOKKOS_CFG_DAG_OPTIONS_DONE Off) -SET(KOKKOS_CFG_DAG_ARCH_DONE Off) -SET(KOKKOS_CFG_DAG_CXX_STD_DONE Off) -SET(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) -FUNCTION(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) - SET(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) - SET(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) - IF (NOT ${PRE_FLAG}) - MESSAGE(FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured") - ENDIF() - GLOBAL_SET(${POST_FLAG} On) -ENDFUNCTION() +set(KOKKOS_CFG_DAG_NONE On) #sentinel to indicate no dependencies +set(KOKKOS_CFG_DAG_DEVICES_DONE Off) +set(KOKKOS_CFG_DAG_OPTIONS_DONE Off) +set(KOKKOS_CFG_DAG_ARCH_DONE Off) +set(KOKKOS_CFG_DAG_CXX_STD_DONE Off) +set(KOKKOS_CFG_DAG_COMPILER_ID_DONE Off) +function(KOKKOS_CFG_DEPENDS SUCCESSOR PRECURSOR) + set(PRE_FLAG KOKKOS_CFG_DAG_${PRECURSOR}) + set(POST_FLAG KOKKOS_CFG_DAG_${SUCCESSOR}) + if(NOT ${PRE_FLAG}) + message( + FATAL_ERROR "Bad CMake refactor: feature ${SUCCESSOR} cannot be configured until ${PRECURSOR} is configured" + ) + endif() + global_set(${POST_FLAG} On) +endfunction() +list(APPEND CMAKE_MODULE_PATH cmake/Modules) -LIST(APPEND CMAKE_MODULE_PATH cmake/Modules) +set(CMAKE_DISABLE_SOURCE_CHANGES ON) +set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) -IF(NOT KOKKOS_HAS_TRILINOS) - set(CMAKE_DISABLE_SOURCE_CHANGES ON) - set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) +# What language are we compiling Kokkos as +# downstream dependencies need to match this! +set(KOKKOS_COMPILE_LANGUAGE CXX) +# use lower case here since we didn't parse options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) - # What language are we compiling Kokkos as - # downstream dependencies need to match this! - SET(KOKKOS_COMPILE_LANGUAGE CXX) - # use lower case here since we didn't parse options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_CUDA) + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as CUDA only + # because otherwise the C++ features don't work etc. + # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even + # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 + # days. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as CUDA only - # because otherwise the C++ features don't work etc. - # This is just the rather odd way CMake does this, since CUDA doesn't imply C++ even - # though it is a C++ extension ... (but I guess it didn't use to be back in CUDA 4 or 5 - # days. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE CUDA) +endif() +# use lower case here since we haven't parsed options yet +if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) - SET(KOKKOS_COMPILE_LANGUAGE CUDA) - ENDIF() - # use lower case here since we haven't parsed options yet - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE AND Kokkos_ENABLE_HIP) + # Without this as a language for the package we would get a C++ compiler enabled. + # but we still need a C++ compiler even if we build all our cpp files as HIP only + # because otherwise the C++ features don't work etc. + set(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) - # Without this as a language for the package we would get a C++ compiler enabled. - # but we still need a C++ compiler even if we build all our cpp files as HIP only - # because otherwise the C++ features don't work etc. - SET(KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE CXX) + set(KOKKOS_COMPILE_LANGUAGE HIP) +endif() - SET(KOKKOS_COMPILE_LANGUAGE HIP) - ENDIF() +if(Spack_WORKAROUND) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + message(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") + endif() - IF (Spack_WORKAROUND) - IF (Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - MESSAGE(FATAL_ERROR "Can't currently use Kokkos_ENABLE_COMPILER_AS_CMAKE_LANGUAGE in a spack installation!") - ENDIF() - - #if we are explicitly using Spack for development, - #nuke the Spack compiler - SET(SPACK_CXX $ENV{SPACK_CXX}) - IF(SPACK_CXX) - SET(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) - SET(ENV{CXX} ${SPACK_CXX}) - ENDIF() - ENDIF() - # Always call the project command to define Kokkos_ variables - # and to make sure that C++ is an enabled language - PROJECT(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) - IF(NOT HAS_PARENT) - IF (NOT CMAKE_BUILD_TYPE) - SET(DEFAULT_BUILD_TYPE "RelWithDebInfo") - MESSAGE(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") - SET(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE STRING - "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." - FORCE) - ENDIF() - ENDIF() -ELSE() - SET(KOKKOS_COMPILE_LANGUAGE CXX) -ENDIF() - -IF (NOT CMAKE_SIZEOF_VOID_P) - STRING(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) - IF (NOT FIND_IDX STREQUAL -1) - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation") - ENDIF() -ELSEIF (NOT CMAKE_SIZEOF_VOID_P EQUAL 8) - IF(CMAKE_SIZEOF_VOID_P EQUAL 4) - MESSAGE(WARNING "32-bit builds are experimental and not officially supported.") - SET(KOKKOS_IMPL_32BIT ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;") - ENDIF() -ENDIF() + #if we are explicitly using Spack for development, + #nuke the Spack compiler + set(SPACK_CXX $ENV{SPACK_CXX}) + if(SPACK_CXX) + set(CMAKE_CXX_COMPILER ${SPACK_CXX} CACHE STRING "the C++ compiler" FORCE) + set(ENV{CXX} ${SPACK_CXX}) + endif() +endif() +# Always call the project command to define Kokkos_ variables +# and to make sure that C++ is an enabled language +project(Kokkos ${KOKKOS_COMPILE_LANGUAGE} ${KOKKOS_INTERNAL_EXTRA_COMPILE_LANGUAGE}) +if(NOT HAS_PARENT) + if(NOT CMAKE_BUILD_TYPE) + set(DEFAULT_BUILD_TYPE "RelWithDebInfo") + message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' as none was specified.") + set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" + CACHE STRING "Choose the type of build, options are: Debug, Release, RelWithDebInfo and MinSizeRel." FORCE + ) + endif() +endif() +if(NOT CMAKE_SIZEOF_VOID_P) + string(FIND ${CMAKE_CXX_COMPILER} nvcc_wrapper FIND_IDX) + if(NOT FIND_IDX STREQUAL -1) + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is CUDA linkage using nvcc_wrapper. Please ensure your CUDA environment is correctly configured." + ) + else() + message( + FATAL_ERROR + "Kokkos did not configure correctly and failed to validate compiler. The most likely cause is linkage errors during CMake compiler validation. Please consult the CMake error log shown below for the exact error during compiler validation" + ) + endif() +elseif(NOT CMAKE_SIZEOF_VOID_P EQUAL 8) + if(CMAKE_SIZEOF_VOID_P EQUAL 4) + message(WARNING "32-bit builds are experimental and not officially supported.") + set(KOKKOS_IMPL_32BIT ON) + else() + message( + FATAL_ERROR + "Kokkos assumes a 64-bit build, i.e., 8-byte pointers, but found ${CMAKE_SIZEOF_VOID_P}-byte pointers instead;" + ) + endif() +endif() set(Kokkos_VERSION_MAJOR 4) -set(Kokkos_VERSION_MINOR 4) +set(Kokkos_VERSION_MINOR 5) set(Kokkos_VERSION_PATCH 1) set(Kokkos_VERSION "${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR}.${Kokkos_VERSION_PATCH}") message(STATUS "Kokkos version: ${Kokkos_VERSION}") @@ -164,58 +162,54 @@ math(EXPR KOKKOS_VERSION_PATCH "${KOKKOS_VERSION} % 100") # Load either the real TriBITS or a TriBITS wrapper # for certain utility functions that are universal (like GLOBAL_SET) -INCLUDE(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) +include(${KOKKOS_SRC_PATH}/cmake/fake_tribits.cmake) -IF (Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # If we are building CUDA, we have tricked CMake because we declare a CXX project # If the default C++ standard for a given compiler matches the requested # standard, then CMake just omits the -std flag in later versions of CMake # This breaks CUDA compilation (CUDA compiler can have a different default # -std then the underlying host compiler by itself). Setting this variable # forces CMake to always add the -std flag even if it thinks it doesn't need it - GLOBAL_SET(CMAKE_CXX_STANDARD_DEFAULT 98) -ENDIF() + global_set(CMAKE_CXX_STANDARD_DEFAULT 98) +endif() # These are the variables we will append to as we go # I really wish these were regular variables # but scoping issues can make it difficult -GLOBAL_SET(KOKKOS_COMPILE_OPTIONS) -GLOBAL_SET(KOKKOS_LINK_OPTIONS) -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) -GLOBAL_SET(KOKKOS_CUDAFE_OPTIONS) -GLOBAL_SET(KOKKOS_XCOMPILER_OPTIONS) +global_set(KOKKOS_COMPILE_OPTIONS) +global_set(KOKKOS_LINK_OPTIONS) +global_set(KOKKOS_AMDGPU_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDAFE_OPTIONS) +global_set(KOKKOS_XCOMPILER_OPTIONS) # We need to append text here for making sure TPLs # we import are available for an installed Kokkos -GLOBAL_SET(KOKKOS_TPL_EXPORTS) +global_set(KOKKOS_TPL_EXPORTS) # KOKKOS_DEPENDENCE is used by kokkos_launch_compiler -GLOBAL_SET(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) +global_set(KOKKOS_COMPILE_DEFINITIONS KOKKOS_DEPENDENCE) # MSVC never goes through kokkos_launch_compiler -IF(NOT MSVC) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) -ENDIF() +if(NOT MSVC) + global_append(KOKKOS_LINK_OPTIONS -DKOKKOS_DEPENDENCE) +endif() -IF(Kokkos_ENABLE_TESTS AND NOT KOKKOS_HAS_TRILINOS) +include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/kokkos_configure_trilinos.cmake) + +if(Kokkos_ENABLE_TESTS) find_package(GTest QUIET) -ENDIF() +endif() # Include a set of Kokkos-specific wrapper functions that # will either call raw CMake or TriBITS # These are functions like KOKKOS_INCLUDE_DIRECTORIES -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) - +include(${KOKKOS_SRC_PATH}/cmake/kokkos_tribits.cmake) # Check the environment and set certain variables # to allow platform-specific checks -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) +include(${KOKKOS_SRC_PATH}/cmake/kokkos_check_env.cmake) -IF(NOT KOKKOS_HAS_TRILINOS) - # This does not work in Trilinos and we simply don't care - # to fix it for Trilinos - # Gather information about the runtime environment - INCLUDE(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) - check_git_setup() -ENDIF() +include(${KOKKOS_SRC_PATH}/cmake/build_env_info.cmake) +check_git_setup() # The build environment setup goes in the following steps # 1) Check all the enable options. This includes checking Kokkos_DEVICES @@ -223,102 +217,54 @@ ENDIF() # 3) Check the CXX standard and select important CXX flags # 4) Check for any third-party libraries (TPLs) like hwloc # 5) Check if optimizing for a particular architecture and add arch-specific flags -KOKKOS_SETUP_BUILD_ENVIRONMENT() +kokkos_setup_build_environment() # Finish off the build # 6) Recurse into subdirectories and configure individual libraries # 7) Export and install targets -OPTION(BUILD_SHARED_LIBS "Build shared libraries" OFF) +option(BUILD_SHARED_LIBS "Build shared libraries" OFF) -SET(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) -SET_PROPERTY(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) +set(KOKKOS_COMPONENT_LIBRARIES kokkoscore kokkoscontainers kokkosalgorithms kokkossimd) +set_property(GLOBAL PROPERTY KOKKOS_INT_LIBRARIES kokkos ${KOKKOS_COMPONENT_LIBRARIES}) -IF (KOKKOS_HAS_TRILINOS) - SET(TRILINOS_INCDIR ${${PROJECT_NAME}_INSTALL_INCLUDE_DIR}) - SET(KOKKOS_HEADER_DIR ${TRILINOS_INCDIR}) - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSEIF(HAS_PARENT) - SET(KOKKOS_HEADER_DIR "include/kokkos") - SET(KOKKOS_IS_SUBDIRECTORY TRUE) -ELSE() - SET(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") - SET(KOKKOS_IS_SUBDIRECTORY FALSE) -ENDIF() +if(HAS_PARENT) + set(KOKKOS_HEADER_DIR "include/kokkos") + set(KOKKOS_IS_SUBDIRECTORY TRUE) +else() + set(KOKKOS_HEADER_DIR "${CMAKE_INSTALL_INCLUDEDIR}") + set(KOKKOS_IS_SUBDIRECTORY FALSE) +endif() #------------------------------------------------------------------------------ # # A) Forward declare the package so that certain options are also defined for # subpackages -## This restores the old behavior of ProjectCompilerPostConfig.cmake -# We must do this before KOKKOS_PACKAGE_DECL -IF (KOKKOS_HAS_TRILINOS) - # Overwrite the old flags at the top-level - # Because Tribits doesn't use lists, it uses spaces for the list of CXX flags - # we have to match the annoying behavior, also we have to preserve quotes - # which needs another workaround. - SET(KOKKOS_COMPILE_OPTIONS_TMP) - IF (KOKKOS_ENABLE_HIP) - LIST(APPEND KOKKOS_COMPILE_OPTIONS ${KOKKOS_AMDGPU_OPTIONS}) - ENDIF() - FOREACH(OPTION ${KOKKOS_COMPILE_OPTIONS}) - STRING(FIND "${OPTION}" " " OPTION_HAS_WHITESPACE) - IF(OPTION_HAS_WHITESPACE EQUAL -1) - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "${OPTION}") - ELSE() - LIST(APPEND KOKKOS_COMPILE_OPTIONS_TMP "\"${OPTION}\"") - ENDIF() - ENDFOREACH() - STRING(REPLACE ";" " " KOKKOSCORE_COMPILE_OPTIONS "${KOKKOS_COMPILE_OPTIONS_TMP}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_COMPILE_OPTIONS}) - IF (KOKKOS_ENABLE_CUDA) - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS ${KOKKOS_CUDA_OPTIONS}) - ENDIF() - FOREACH(XCOMP_FLAG ${KOKKOS_XCOMPILER_OPTIONS}) - SET(KOKKOSCORE_XCOMPILER_OPTIONS "${KOKKOSCORE_XCOMPILER_OPTIONS} -Xcompiler ${XCOMP_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcompiler ${XCOMP_FLAG}) - ENDFOREACH() - IF (KOKKOS_ENABLE_CUDA) - STRING(REPLACE ";" " " KOKKOSCORE_CUDA_OPTIONS "${KOKKOS_CUDA_OPTIONS}") - FOREACH(CUDAFE_FLAG ${KOKKOS_CUDAFE_OPTIONS}) - SET(KOKKOSCORE_CUDAFE_OPTIONS "${KOKKOSCORE_CUDAFE_OPTIONS} -Xcudafe ${CUDAFE_FLAG}") - LIST(APPEND KOKKOS_ALL_COMPILE_OPTIONS -Xcudafe ${CUDAFE_FLAG}) - ENDFOREACH() - ENDIF() - #These flags get set up in KOKKOS_PACKAGE_DECL, which means they - #must be configured before KOKKOS_PACKAGE_DECL - SET(KOKKOS_ALL_COMPILE_OPTIONS - $<$:${KOKKOS_ALL_COMPILE_OPTIONS}>) -ENDIF() - - #------------------------------------------------------------------------------ # # D) Process the subpackages (subdirectories) for Kokkos # -KOKKOS_PROCESS_SUBPACKAGES() - +kokkos_process_subpackages() #------------------------------------------------------------------------------ # # E) If Kokkos itself is enabled, process the Kokkos package # -KOKKOS_PACKAGE_POSTPROCESS() -KOKKOS_CONFIGURE_CORE() +kokkos_configure_core() -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - ADD_LIBRARY(kokkos INTERFACE) +if(NOT Kokkos_INSTALL_TESTING) + add_library(kokkos INTERFACE) #Make sure in-tree projects can reference this as Kokkos:: #to match the installed target names - ADD_LIBRARY(Kokkos::kokkos ALIAS kokkos) + add_library(Kokkos::kokkos ALIAS kokkos) # all_libs target is required for TriBITS-compliance - ADD_LIBRARY(Kokkos::all_libs ALIAS kokkos) - TARGET_LINK_LIBRARIES(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(kokkos) -ENDIF() -INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) + add_library(Kokkos::all_libs ALIAS kokkos) + target_link_libraries(kokkos INTERFACE ${KOKKOS_COMPONENT_LIBRARIES}) + kokkos_internal_add_library_install(kokkos) +endif() +include(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # nvcc_wrapper is Kokkos' wrapper for NVIDIA's NVCC CUDA compiler. # Kokkos needs nvcc_wrapper in order to build. Other libraries and @@ -327,16 +273,15 @@ INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_install.cmake) # as relative to ${CMAKE_INSTALL_PATH}. # KOKKOS_INSTALL_ADDITIONAL_FILES will install nvcc wrapper and other generated # files -KOKKOS_INSTALL_ADDITIONAL_FILES() - +kokkos_install_additional_files() # Finally - if we are a subproject - make sure the enabled devices are visible -IF (HAS_PARENT) - FOREACH(DEV Kokkos_ENABLED_DEVICES) +if(HAS_PARENT) + foreach(DEV Kokkos_ENABLED_DEVICES) #I would much rather not make these cache variables or global properties, but I can't #make any guarantees on whether PARENT_SCOPE is good enough to make #these variables visible where I need them - SET(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) - SET_PROPERTY(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) - ENDFOREACH() -ENDIF() + set(Kokkos_ENABLE_${DEV} ON PARENT_SCOPE) + set_property(GLOBAL PROPERTY Kokkos_ENABLE_${DEV} ON) + endforeach() +endif() diff --git a/lib/kokkos/CONTRIBUTING.md b/lib/kokkos/CONTRIBUTING.md index b4f3057cef..e97f8c4d89 100644 --- a/lib/kokkos/CONTRIBUTING.md +++ b/lib/kokkos/CONTRIBUTING.md @@ -7,6 +7,8 @@ We actively welcome pull requests. 3. If you've changed APIs, update the documentation. 4. Ensure the test suite passes. +Before sending your patch for review, please try to ensure that it is formatted properly. We use clang-format version 16 for this. + ## Issues We use GitHub issues to track public bugs. Please ensure your description is clear and has sufficient instructions to be able to reproduce the issue. diff --git a/lib/kokkos/HOW_TO_SNAPSHOT b/lib/kokkos/HOW_TO_SNAPSHOT deleted file mode 100644 index ad3f78efb4..0000000000 --- a/lib/kokkos/HOW_TO_SNAPSHOT +++ /dev/null @@ -1,73 +0,0 @@ - -Developers of Kokkos (those who commit modifications to Kokkos) -must maintain the snapshot of Kokkos in the Trilinos repository. - -This file contains instructions for how to -snapshot Kokkos from github.com/kokkos to Trilinos. - ------------------------------------------------------------------------- -*** EVERYTHING GOES RIGHT WORKFLOW *** - -1) Given a 'git clone' of Kokkos and of Trilinos repositories. -1.1) Let ${KOKKOS} be the absolute path to the Kokkos clone. - This path *must* terminate with the directory name 'kokkos'; - e.g., ${HOME}/kokkos . -1.2) Let ${TRILINOS} be the absolute path to the Trilinos directory. - -2) Given that the Kokkos build & test is clean and - changes are committed to the Kokkos clone. - -3) Snapshot the current commit in the Kokkos clone into the Trilinos clone. - This overwrites ${TRILINOS}/packages/kokkos with the content of ${KOKKOS}: - ${KOKKOS}/scripts/snapshot.py --verbose ${KOKKOS} ${TRILINOS}/packages - -4) Verify the snapshot commit happened as expected - cd ${TRILINOS}/packages/kokkos - git log -1 --name-only - -5) Modify, build, and test Trilinos with the Kokkos snapshot. - -6) Given that that the Trilinos build & test is clean and - changes are committed to the Trilinos clone. - -7) Attempt push to the Kokkos repository. - If push fails then you must 'remove the Kokkos snapshot' - from your Trilinos clone. - See below. - -8) Attempt to push to the Trilinos repository. - If updating for a failed push requires you to change Kokkos you must - 'remove the Kokkos snapshot' from your Trilinos clone. - See below. - ------------------------------------------------------------------------- -*** WHEN SOMETHING GOES WRONG AND YOU MUST *** -*** REMOVE THE KOKKOS SNAPSHOT FROM YOUR TRILINOS CLONE *** - -1) Query the Trilinos clone commit log. - git log --oneline - -2) Note the of the commit to the Trillinos clone - immediately BEFORE the Kokkos snapshot commit. - Copy this for use in the next command. - -3) IF more than one outstanding commit then you can remove just the - Kokkos snapshot commit with 'git rebase -i'. Edit the rebase file. - Remove or comment out the Kokkos snapshot commit entry. - git rebase -i - -4) IF the Kokkos snapshot commit is the one and only - outstanding commit then remove just than commit. - git reset --hard HEAD~1 - ------------------------------------------------------------------------- -*** REGARDING 'snapshot.py' TOOL *** - -The 'snapshot.py' tool is developed and maintained by the -Center for Computing Research (CCR) -Software Engineering, Maintenance, and Support (SEMS) team. - -Contact Brent Perschbacher for questions> - ------------------------------------------------------------------------- - diff --git a/lib/kokkos/Makefile.kokkos b/lib/kokkos/Makefile.kokkos index 6b627dcc36..abdfb7a316 100644 --- a/lib/kokkos/Makefile.kokkos +++ b/lib/kokkos/Makefile.kokkos @@ -1,6 +1,6 @@ # Default settings common options. -#LAMMPS specific settings: +#SPARTA specific settings: ifndef KOKKOS_PATH KOKKOS_PATH=../../lib/kokkos endif @@ -11,7 +11,7 @@ CXXFLAGS += $(SHFLAGS) endif KOKKOS_VERSION_MAJOR = 4 -KOKKOS_VERSION_MINOR = 4 +KOKKOS_VERSION_MINOR = 5 KOKKOS_VERSION_PATCH = 1 KOKKOS_VERSION = $(shell echo $(KOKKOS_VERSION_MAJOR)*10000+$(KOKKOS_VERSION_MINOR)*100+$(KOKKOS_VERSION_PATCH) | bc) @@ -23,7 +23,7 @@ KOKKOS_DEVICES ?= "OpenMP" # NVIDIA: Kepler,Kepler30,Kepler32,Kepler35,Kepler37,Maxwell,Maxwell50,Maxwell52,Maxwell53,Pascal60,Pascal61,Volta70,Volta72,Turing75,Ampere80,Ampere86,Ada89,Hopper90 # ARM: ARMv80,ARMv81,ARMv8-ThunderX,ARMv8-TX2,A64FX,ARMv9-Grace # IBM: Power8,Power9 -# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 +# AMD-GPUS: AMD_GFX906,AMD_GFX908,AMD_GFX90A,AMD_GFX940,AMD_GFX942,AMD_GFX942_APU,AMD_GFX1030,AMD_GFX1100,AMD_GFX1103 # AMD-CPUS: AMDAVX,Zen,Zen2,Zen3 # Intel-GPUs: Intel_Gen,Intel_Gen9,Intel_Gen11,Intel_Gen12LP,Intel_DG1,Intel_XeHP,Intel_PVC KOKKOS_ARCH ?= "" @@ -40,16 +40,19 @@ KOKKOS_TRIBITS ?= "no" KOKKOS_STANDALONE_CMAKE ?= "no" # Default settings specific options. -# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,disable_malloc_async -KOKKOS_CUDA_OPTIONS ?= "disable_malloc_async" +# Options: force_uvm,use_ldg,rdc,enable_lambda,enable_constexpr,enable_malloc_async +KOKKOS_CUDA_OPTIONS ?= "" -# Options: rdc +# Options: rdc,enable_malloc_async KOKKOS_HIP_OPTIONS ?= "" # Default settings specific options. # Options: enable_async_dispatch KOKKOS_HPX_OPTIONS ?= "" +#Options : force_host_as_device +KOKKOS_OPENACC_OPTIONS ?= "" + # Helper functions for conversion to upper case uppercase_TABLE:=a,A b,B c,C d,D e,E f,F g,G h,H i,I j,J k,K l,L m,M n,N o,O p,P q,Q r,R s,S t,T u,U v,V w,W x,X y,Y z,Z uppercase_internal=$(if $1,$$(subst $(firstword $1),$(call uppercase_internal,$(wordlist 2,$(words $1),$1),$2)),$2) @@ -92,7 +95,7 @@ KOKKOS_INTERNAL_CUDA_USE_UVM := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS), KOKKOS_INTERNAL_CUDA_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),rdc) KOKKOS_INTERNAL_CUDA_USE_LAMBDA := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_lambda) KOKKOS_INTERNAL_CUDA_USE_CONSTEXPR := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_constexpr) -KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),disable_malloc_async) +KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_CUDA_OPTIONS),enable_malloc_async) KOKKOS_INTERNAL_HPX_ENABLE_ASYNC_DISPATCH := $(call kokkos_has_string,$(KOKKOS_HPX_OPTIONS),enable_async_dispatch) # deprecated KOKKOS_INTERNAL_ENABLE_DESUL_ATOMICS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_desul_atomics) @@ -103,6 +106,8 @@ KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE := $(call kokkos_has_string,$(KOKKOS_OPT KOKKOS_INTERNAL_ENABLE_DEPRECATION_WARNINGS := $(call kokkos_has_string,$(KOKKOS_OPTIONS),enable_deprecation_warnings) KOKKOS_INTERNAL_HIP_USE_RELOC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),rdc) +KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC := $(call kokkos_has_string,$(KOKKOS_HIP_OPTIONS),enable_malloc_async) +KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE := $(call kokkos_has_string,$(KOKKOS_OPENACC_OPTIONS),force_host_as_device) # Check for Kokkos Host Execution Spaces one of which must be on. KOKKOS_INTERNAL_USE_OPENMP := $(call kokkos_has_string,$(subst OpenMPTarget,,$(KOKKOS_DEVICES)),OpenMP) @@ -178,7 +183,7 @@ KOKKOS_INTERNAL_COMPILER_CRAY := $(strip $(shell $(CXX) -craype-verbose 2 KOKKOS_INTERNAL_COMPILER_NVCC := $(strip $(shell echo "$(shell export OMPI_CXX=$(OMPI_CXX); export MPICH_CXX=$(MPICH_CXX); $(CXX) --version 2>&1 | grep -c nvcc)>0" | bc)) KOKKOS_INTERNAL_COMPILER_NVHPC := $(strip $(shell $(CXX) --version 2>&1 | grep -c "nvc++")) KOKKOS_INTERNAL_COMPILER_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),clang) -KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -c "clang++")) +KOKKOS_INTERNAL_COMPILER_CRAY_CLANG := $(strip $(shell $(CXX) -craype-verbose 2>&1 | grep -v "error:" | grep -c "clang++")) KOKKOS_INTERNAL_COMPILER_INTEL_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),oneAPI) KOKKOS_INTERNAL_COMPILER_APPLE_CLANG := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),Apple clang) KOKKOS_INTERNAL_COMPILER_HCC := $(call kokkos_has_string,$(KOKKOS_CXX_VERSION),HCC) @@ -292,6 +297,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) # Set OpenACC flags. ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) KOKKOS_INTERNAL_OPENACC_FLAG := -acc + else ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_OPENACC_FLAG := -fopenacc -fopenacc-fake-async-wait -fopenacc-implicit-worker=vector -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version -Wno-pass-failed else $(error Makefile.kokkos: OpenACC is enabled but the compiler must be NVHPC (got version string $(KOKKOS_CXX_VERSION))) endif @@ -411,8 +418,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) - KOKKOS_INTERNAL_NVCC_PATH := $(shell which nvcc) CUDA_PATH ?= $(KOKKOS_INTERNAL_NVCC_PATH:/bin/nvcc=) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_INTERNAL_OPENMPTARGET_FLAG := $(KOKKOS_INTERNAL_OPENMPTARGET_FLAG) --cuda-path=$(CUDA_PATH) @@ -457,6 +464,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 0) endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX940) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942) +KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX942_APU) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1030) ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 0) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1030) @@ -466,6 +474,15 @@ ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 0) KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100 := $(call kokkos_has_string,$(KOKKOS_ARCH),NAVI1100) endif KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103 := $(call kokkos_has_string,$(KOKKOS_ARCH),AMD_GFX1103) +KOKKOS_INTERNAL_USE_ARCH_AMD := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100) \ + + $(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103)) # Any AVX? KOKKOS_INTERNAL_USE_ARCH_AVX := $(shell expr $(KOKKOS_INTERNAL_USE_ARCH_SNB) + $(KOKKOS_INTERNAL_USE_ARCH_AMDAVX)) @@ -561,6 +578,9 @@ endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE") + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -733,7 +753,7 @@ ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) endif endif - ifeq ($(KOKKOS_INTERNAL_CUDA_DISABLE_MALLOC_ASYNC), 0) + ifeq ($(KOKKOS_INTERNAL_CUDA_ENABLE_MALLOC_ASYNC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC") else tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC */") @@ -1024,86 +1044,122 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) endif endif +ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + KOKKOS_INTERNAL_CUDA_ARCH_FLAG=--offload-arch + endif +endif + # Do not add this flag if its the cray compiler or the nvhpc compiler. ifeq ($(KOKKOS_INTERNAL_COMPILER_CRAY_CLANG), 0) - ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) - # Lets start with adding architecture defines - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + # Lets start with adding architecture defines + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER30), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER30") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_30 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER32), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER32") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_32 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER35), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER35") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_35 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_KEPLER37), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_KEPLER37") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_37 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL50), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL50") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_50 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL52), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL52") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_52 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_MAXWELL53), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_MAXWELL53") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_53 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL60), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL60") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_60 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_PASCAL61), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_PASCAL61") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_61 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA70), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA70") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_70 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_VOLTA72), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_VOLTA72") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_72 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_TURING75), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_TURING75") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_75 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE80), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE80") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_80 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMPERE86), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMPERE86") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_86 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_ADA89), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_ADA89") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_89 endif - ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + endif + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_HOPPER90), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_HOPPER90") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 0) KOKKOS_INTERNAL_CUDA_ARCH_FLAG := $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG)=sm_90 endif endif @@ -1119,6 +1175,9 @@ ifneq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 0) ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) endif + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_CUDA_ARCH_FLAG) + endif endif endif @@ -1126,43 +1185,48 @@ endif # Figure out the architecture flag for ROCm. ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX906), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX906") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx906 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx906\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx906 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX908), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX908") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx908 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx908\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx908 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX90A), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX90A") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx90a + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx90A\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx90a endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX940), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX940") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx940 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx940\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx940 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx942 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 +endif +ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX942_APU), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX942_APU") + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx942\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx942 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1030), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1030") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1030 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1030\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1030 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1100), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1100") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1100 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1100\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1100 endif ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD_GFX1103), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GFX1103") - tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU") - KOKKOS_INTERNAL_HIP_ARCH_FLAG := --offload-arch=gfx1103 + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ARCH_AMD_GPU \"gfx1103\"") + KOKKOS_INTERNAL_AMD_ARCH_FLAG := --offload-arch=gfx1103 endif @@ -1171,8 +1235,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_SRC += $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/HIP/*.hpp) - KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) - KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_HIP_ARCH_FLAG) + KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_AMD_ARCH_FLAG) ifeq ($(KOKKOS_INTERNAL_HIP_USE_RELOC), 1) tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE") @@ -1182,6 +1246,21 @@ ifeq ($(KOKKOS_INTERNAL_USE_HIP), 1) KOKKOS_CXXFLAGS+=-fno-gpu-rdc KOKKOS_LDFLAGS+=-fno-gpu-rdc endif + + ifeq ($(KOKKOS_INTERNAL_HIP_ENABLE_MALLOC_ASYNC), 1) + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC") + else + tmp := $(call kokkos_append_header,"/* $H""undef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC */") + endif +endif + +ifneq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 0) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) + KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_AMD_ARCH_FLAG) + endif + endif endif # Figure out Intel architecture flags. @@ -1235,6 +1314,8 @@ ifeq ($(KOKKOS_INTERNAL_USE_SYCL), 1) KOKKOS_CXXFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) KOKKOS_LDFLAGS+=-fsycl KOKKOS_LDFLAGS+=$(KOKKOS_INTERNAL_INTEL_ARCH_FLAG) + + tmp := $(call kokkos_append_header,"$H""define KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE") endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) @@ -1322,6 +1403,8 @@ ifneq ($(KOKKOS_INTERNAL_NEW_CONFIG), 0) endif KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/*.hpp) +KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/View/MDSpan/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/core/src/impl/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/*.hpp) KOKKOS_HEADERS += $(wildcard $(KOKKOS_PATH)/containers/src/impl/*.hpp) @@ -1374,6 +1457,48 @@ ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) KOKKOS_CXXFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LDFLAGS += $(KOKKOS_INTERNAL_OPENACC_FLAG) KOKKOS_LIBS += $(KOKKOS_INTERNAL_OPENACC_LIB) + ifeq ($(KOKKOS_INTERNAL_USE_ARCH_NVIDIA), 1) + ifneq ($(CUDA_PATH),) + ifeq ($(call kokkos_path_exists,$(CUDA_PATH)/lib), 1) + CUDA_PATH := $(CUDA_PATH:/compilers=/cuda) + endif + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(CUDA_PATH),) + KOKKOS_LDFLAGS += -L$(CUDA_PATH)/lib64 + endif + KOKKOS_LIBS += -lcudart + endif + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_LIBS += -cuda + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_USE_ARCH_AMD), 1) + ifeq ($(KOKKOS_INTERNAL_COMPILER_CLANG), 1) + ifneq ($(ROCM_PATH),) + KOKKOS_CPPFLAGS += -I$(ROCM_PATH)/include + KOKKOS_LDFLAGS += -L$(ROCM_PATH)/lib + endif + KOKKOS_LIBS += -lamdhip64 + endif + ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + $(error If a GPU architecture is specified, KOKKOS_OPENACC_OPTIONS = force_host_as_device cannot be used. Disable the force_host_as_device option) + endif + else ifeq ($(KOKKOS_INTERNAL_OPENACC_FORCE_HOST_AS_DEVICE), 1) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=multicore + endif + else + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + KOKKOS_CXXFLAGS += -acc=gpu,multicore + endif + endif endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMP), 1) @@ -1484,7 +1609,11 @@ else endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) - tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + ifeq ($(KOKKOS_INTERNAL_COMPILER_NVHPC), 1) + tmp := $(call desul_append_header,"$H""define DESUL_ATOMICS_ENABLE_OPENACC") + else + tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") + endif else tmp := $(call desul_append_header,"/* $H""undef DESUL_ATOMICS_ENABLE_OPENACC */") endif @@ -1512,6 +1641,12 @@ $(DESUL_CONFIG_HEADER): KOKKOS_CPP_DEPENDS := $(DESUL_CONFIG_HEADER) KokkosCore_config.h $(KOKKOS_HEADERS) +# Tasking is deprecated +ifeq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) + TMP_KOKKOS_SRC := $(KOKKOS_SRC) + KOKKOS_SRC = $(patsubst %Task.cpp,, $(TMP_KOKKOS_SRC)) +endif + KOKKOS_OBJ = $(KOKKOS_SRC:.cpp=.o) KOKKOS_OBJ_LINK = $(notdir $(KOKKOS_OBJ)) diff --git a/lib/kokkos/Makefile.targets b/lib/kokkos/Makefile.targets index e8e429e027..be535eea3e 100644 --- a/lib/kokkos/Makefile.targets +++ b/lib/kokkos/Makefile.targets @@ -16,8 +16,6 @@ Kokkos_HostSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Ho $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostSpace.cpp Kokkos_hwloc.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_hwloc.cpp -Kokkos_TaskQueue.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_TaskQueue.cpp Kokkos_HostThreadTeam.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/impl/Kokkos_HostThreadTeam.cpp Kokkos_HostBarrier.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_HostBarrier.cpp @@ -38,17 +36,21 @@ Kokkos_Abort.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/impl/Kokkos_Abort. ifeq ($(KOKKOS_INTERNAL_USE_SERIAL), 1) Kokkos_Serial.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Serial_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Serial/Kokkos_Serial_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_CUDA), 1) Kokkos_Cuda_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Instance.cpp Kokkos_CudaSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_CudaSpace.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_Cuda_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/Cuda/Kokkos_Cuda_Task.cpp +endif Lock_Array_CUDA.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_CUDA.cpp endif @@ -73,6 +75,8 @@ Kokkos_HIP_Space.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Space.cpp Kokkos_HIP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_Instance.cpp +Kokkos_HIP_ZeroMemset.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp + $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp Lock_Array_HIP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/tpls/desul/src/Lock_Array_HIP.cpp endif @@ -89,26 +93,26 @@ Kokkos_OpenMP.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_Ope $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP.cpp Kokkos_OpenMP_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Instance.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_OpenMP_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMP/Kokkos_OpenMP_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_HPX), 1) Kokkos_HPX.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX.cpp +ifneq ($(KOKKOS_INTERNAL_DISABLE_DEPRECATED_CODE), 1) Kokkos_HPX_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/HPX/Kokkos_HPX_Task.cpp endif +endif ifeq ($(KOKKOS_INTERNAL_USE_OPENMPTARGET), 1) -Kokkos_OpenMPTarget_Exec.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp Kokkos_OpenMPTarget_Instance.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp Kokkos_OpenMPTargetSpace.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.cpp -Kokkos_OpenMPTarget_Task.o: $(KOKKOS_CPP_DEPENDS) $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp - $(CXX) $(KOKKOS_CPPFLAGS) $(KOKKOS_CXXFLAGS) $(CXXFLAGS) -c $(KOKKOS_PATH)/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp endif ifeq ($(KOKKOS_INTERNAL_USE_OPENACC), 1) diff --git a/lib/kokkos/README.md b/lib/kokkos/README.md index c8c6f8f7cf..56159b35c2 100644 --- a/lib/kokkos/README.md +++ b/lib/kokkos/README.md @@ -30,12 +30,12 @@ To start learning about Kokkos: The latest release of Kokkos can be obtained from the [GitHub releases page](https://github.com/kokkos/kokkos/releases/latest). -The current release is [4.3.01](https://github.com/kokkos/kokkos/releases/tag/4.3.01). +The current release is [4.5.01](https://github.com/kokkos/kokkos/releases/tag/4.5.01). ```bash -curl -OJ -L https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +curl -OJ -L https://github.com/kokkos/kokkos/releases/download/4.5.01/kokkos-4.5.01.tar.gz # Or with wget -wget https://github.com/kokkos/kokkos/archive/refs/tags/4.3.01.tar.gz +wget https://github.com/kokkos/kokkos/releases/download/4.5.01/kokkos-4.5.01.tar.gz ``` To clone the latest development version of Kokkos from GitHub: diff --git a/lib/kokkos/algorithms/CMakeLists.txt b/lib/kokkos/algorithms/CMakeLists.txt index 368984647e..73ce9f7ec5 100644 --- a/lib/kokkos/algorithms/CMakeLists.txt +++ b/lib/kokkos/algorithms/CMakeLists.txt @@ -1,7 +1,7 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) - KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -ENDIF() +if(NOT ((KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) OR KOKKOS_ENABLE_OPENACC)) + kokkos_add_test_directories(unit_tests) +endif() diff --git a/lib/kokkos/algorithms/src/CMakeLists.txt b/lib/kokkos/algorithms/src/CMakeLists.txt index b490caca62..9f10b85e02 100644 --- a/lib/kokkos/algorithms/src/CMakeLists.txt +++ b/lib/kokkos/algorithms/src/CMakeLists.txt @@ -1,34 +1,29 @@ #I have to leave these here for tribits -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -FILE(GLOB ALGO_HEADERS *.hpp) -FILE(GLOB ALGO_SOURCES *.cpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) -APPEND_GLOB(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) +file(GLOB ALGO_HEADERS *.hpp) +file(GLOB ALGO_SOURCES *.cpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/*.hpp) +append_glob(ALGO_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/std_algorithms/impl/*.hpp) -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) #----------------------------------------------------------------------------- # We have to pass the sources in here for Tribits # These will get ignored for standalone CMake and a true interface library made -KOKKOS_ADD_INTERFACE_LIBRARY( - kokkosalgorithms - NOINSTALLHEADERS ${ALGO_HEADERS} - SOURCES ${ALGO_SOURCES} -) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkosalgorithms - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_add_interface_library(kokkosalgorithms NOINSTALLHEADERS ${ALGO_HEADERS} SOURCES ${ALGO_SOURCES}) +kokkos_lib_include_directories( + kokkosalgorithms ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_TPL(kokkoscontainers PUBLIC ROCTHRUST) -KOKKOS_LINK_TPL(kokkoscore PUBLIC ONEDPL) +kokkos_link_tpl(kokkoscontainers PUBLIC ROCTHRUST) +kokkos_link_tpl(kokkoscore PUBLIC ONEDPL) diff --git a/lib/kokkos/algorithms/src/Kokkos_Random.hpp b/lib/kokkos/algorithms/src/Kokkos_Random.hpp index 7df12b8518..b28ea4c2ca 100644 --- a/lib/kokkos/algorithms/src/Kokkos_Random.hpp +++ b/lib/kokkos/algorithms/src/Kokkos_Random.hpp @@ -615,7 +615,7 @@ template struct Random_UniqueIndex { using locks_view_type = View; KOKKOS_FUNCTION - static int get_state_idx(const locks_view_type) { + static int get_state_idx(const locks_view_type&) { KOKKOS_IF_ON_HOST( (return DeviceType::execution_space::impl_hardware_thread_id();)) @@ -665,17 +665,16 @@ struct Random_UniqueIndex< #ifdef KOKKOS_ENABLE_SYCL template -struct Random_UniqueIndex< - Kokkos::Device> { +struct Random_UniqueIndex> { using locks_view_type = - View>; + View>; KOKKOS_FUNCTION static int get_state_idx(const locks_view_type& locks_) { auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; std::size_t gridDim[3] = { @@ -1121,7 +1120,7 @@ class Random_XorShift1024_Pool { using execution_space = typename device_type::execution_space; using locks_type = View; using int_view_type = View; - using state_data_type = View; + using state_data_type = View; locks_type locks_ = {}; state_data_type state_ = {}; diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp index 73e751f572..8e7de32a07 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinOpsPublicAPI.hpp @@ -35,11 +35,11 @@ struct BinOp1D { #endif // Construct BinOp with number of bins, minimum value and maximum value - BinOp1D(int max_bins__, typename KeyViewType::const_value_type min, + BinOp1D(int max_bins, typename KeyViewType::const_value_type min, typename KeyViewType::const_value_type max) - : max_bins_(max_bins__ + 1), + : max_bins_(max_bins + 1), // Cast to double to avoid possible overflow when using integer - mul_(static_cast(max_bins__) / + mul_(static_cast(max_bins) / (static_cast(max) - static_cast(min))), min_(static_cast(min)) { // For integral types the number of bins may be larger than the range @@ -47,7 +47,7 @@ struct BinOp1D { // and then don't need to sort bins. if (std::is_integral::value && (static_cast(max) - static_cast(min)) <= - static_cast(max_bins__)) { + static_cast(max_bins)) { mul_ = 1.; } } @@ -82,16 +82,16 @@ struct BinOp3D { BinOp3D() = delete; #endif - BinOp3D(int max_bins__[], typename KeyViewType::const_value_type min[], + BinOp3D(int max_bins[], typename KeyViewType::const_value_type min[], typename KeyViewType::const_value_type max[]) { - max_bins_[0] = max_bins__[0]; - max_bins_[1] = max_bins__[1]; - max_bins_[2] = max_bins__[2]; - mul_[0] = static_cast(max_bins__[0]) / + max_bins_[0] = max_bins[0]; + max_bins_[1] = max_bins[1]; + max_bins_[2] = max_bins[2]; + mul_[0] = static_cast(max_bins[0]) / (static_cast(max[0]) - static_cast(min[0])); - mul_[1] = static_cast(max_bins__[1]) / + mul_[1] = static_cast(max_bins[1]) / (static_cast(max[1]) - static_cast(min[1])); - mul_[2] = static_cast(max_bins__[2]) / + mul_[2] = static_cast(max_bins[2]) / (static_cast(max[2]) - static_cast(min[2])); min_[0] = static_cast(min[0]); min_[1] = static_cast(min[1]); diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp index c399279fe4..f417b6b13b 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_BinSortPublicAPI.hpp @@ -388,7 +388,8 @@ class BinSort { // reasonable experimentally. if (use_std_sort && bin_size > 10) { KOKKOS_IF_ON_HOST( - (std::sort(&sort_order(lower_bound), &sort_order(upper_bound), + (std::sort(sort_order.data() + lower_bound, + sort_order.data() + upper_bound, [this](int p, int q) { return bin_op(keys_rnd, p, q); });)) } else { for (int k = lower_bound + 1; k < upper_bound; ++k) { diff --git a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp index 308e9e3a00..20026c77e4 100644 --- a/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp +++ b/lib/kokkos/algorithms/src/sorting/Kokkos_SortPublicAPI.hpp @@ -53,9 +53,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort without comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size()); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last); + } } else { Impl::sort_device_view_without_comparator(exec, view); } @@ -107,9 +111,13 @@ void sort(const ExecutionSpace& exec, if constexpr (Impl::better_off_calling_std_sort_v) { exec.fence("Kokkos::sort with comparator use std::sort"); - auto first = ::Kokkos::Experimental::begin(view); - auto last = ::Kokkos::Experimental::end(view); - std::sort(first, last, comparator); + if (view.span_is_contiguous()) { + std::sort(view.data(), view.data() + view.size(), comparator); + } else { + auto first = ::Kokkos::Experimental::begin(view); + auto last = ::Kokkos::Experimental::end(view); + std::sort(first, last, comparator); + } } else { Impl::sort_device_view_with_comparator(exec, view, comparator); } diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp index f11f807048..2a8f761d9b 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortByKeyImpl.hpp @@ -30,6 +30,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -76,13 +77,10 @@ namespace Kokkos::Impl { template constexpr inline bool is_admissible_to_kokkos_sort_by_key = - ::Kokkos::is_view::value&& T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value); + ::Kokkos::is_view::value && T::rank() == 1 && + (std::is_same_v || + std::is_same_v || + std::is_same_v); template KOKKOS_INLINE_FUNCTION constexpr void @@ -144,7 +142,7 @@ void sort_by_key_rocthrust( #if defined(KOKKOS_ENABLE_ONEDPL) template -inline constexpr bool sort_on_device_v = +inline constexpr bool sort_on_device_v = std::is_same_v || std::is_same_v; @@ -152,7 +150,7 @@ inline constexpr bool sort_on_device_v = template void sort_by_key_onedpl( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, MaybeComparator&&... maybeComparator) { @@ -176,7 +174,7 @@ template void applyPermutation(const ExecutionSpace& space, const PermutationView& permutation, const ViewType& view) { - static_assert(std::is_integral::value); + static_assert(std::is_integral_v); auto view_copy = Kokkos::create_mirror( Kokkos::view_alloc(space, typename ExecutionSpace::memory_space{}, @@ -335,7 +333,7 @@ void sort_by_key_device_view_without_comparator( template void sort_by_key_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values) { #ifdef KOKKOS_ONEDPL_HAS_SORT_BY_KEY @@ -392,7 +390,7 @@ void sort_by_key_device_view_with_comparator( template void sort_by_key_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& keys, const Kokkos::View& values, const ComparatorType& comparator) { diff --git a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp index 0894622891..734ce450f6 100644 --- a/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp +++ b/lib/kokkos/algorithms/src/sorting/impl/Kokkos_SortImpl.hpp @@ -34,6 +34,7 @@ #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wshadow" +#pragma GCC diagnostic ignored "-Wsuggest-override" #if defined(KOKKOS_COMPILER_CLANG) // Some versions of Clang fail to compile Thrust, failing with errors like @@ -146,7 +147,7 @@ void sort_via_binsort(const ExecutionSpace& exec, bool sort_in_bins = true; // TODO: figure out better max_bins then this ... int64_t max_bins = view.extent(0) / 2; - if (std::is_integral::value) { + if (std::is_integral_v) { // Cast to double to avoid possible overflow when using integer auto const max_val = static_cast(result.max_val); auto const min_val = static_cast(result.min_val); @@ -157,7 +158,7 @@ void sort_via_binsort(const ExecutionSpace& exec, sort_in_bins = false; } } - if (std::is_floating_point::value) { + if (std::is_floating_point_v) { KOKKOS_ASSERT(std::isfinite(static_cast(result.max_val) - static_cast(result.min_val))); } @@ -211,11 +212,11 @@ void sort_rocthrust(const HIP& space, #if defined(KOKKOS_ENABLE_ONEDPL) template -void sort_onedpl(const Kokkos::Experimental::SYCL& space, +void sort_onedpl(const Kokkos::SYCL& space, const Kokkos::View& view, MaybeComparator&&... maybeComparator) { using ViewType = Kokkos::View; - static_assert(SpaceAccessibility::accessible, "SYCL execution space is not able to access the memory space " "of the View argument!"); @@ -268,19 +269,29 @@ void copy_to_host_run_stdsort_copy_back( KE::copy(exec, view, view_dc); // run sort on the mirror of view_dc - auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); - auto first = KE::begin(mv_h); - auto last = KE::end(mv_h); - std::sort(first, last, std::forward(maybeComparator)...); + auto mv_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view_dc); + if (view.span_is_contiguous()) { + std::sort(mv_h.data(), mv_h.data() + mv_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(mv_h); + auto last = KE::end(mv_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view_dc, mv_h); // copy back to argument view KE::copy(exec, KE::cbegin(view_dc), KE::cend(view_dc), KE::begin(view)); } else { auto view_h = create_mirror_view_and_copy(Kokkos::HostSpace(), view); - auto first = KE::begin(view_h); - auto last = KE::end(view_h); - std::sort(first, last, std::forward(maybeComparator)...); + if (view.span_is_contiguous()) { + std::sort(view_h.data(), view_h.data() + view_h.size(), + std::forward(maybeComparator)...); + } else { + auto first = KE::begin(view_h); + auto last = KE::end(view_h); + std::sort(first, last, std::forward(maybeComparator)...); + } Kokkos::deep_copy(exec, view, view_h); } } @@ -310,7 +321,7 @@ void sort_device_view_without_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_without_comparator( - const Kokkos::Experimental::SYCL& exec, + const Kokkos::SYCL& exec, const Kokkos::View& view) { using ViewType = Kokkos::View; static_assert( @@ -365,8 +376,7 @@ void sort_device_view_with_comparator( #if defined(KOKKOS_ENABLE_ONEDPL) template void sort_device_view_with_comparator( - const Kokkos::Experimental::SYCL& exec, - const Kokkos::View& view, + const Kokkos::SYCL& exec, const Kokkos::View& view, const ComparatorType& comparator) { using ViewType = Kokkos::View; static_assert( @@ -397,12 +407,12 @@ sort_device_view_with_comparator( // and then copies data back. Potentially, this can later be changed // with a better solution like our own quicksort on device or similar. - using ViewType = Kokkos::View; - using MemSpace = typename ViewType::memory_space; // Note with HIP unified memory this code path is still the right thing to do // if we end up here when RocThrust is not enabled. // The create_mirror_view_and_copy will do the right thing (no copy). -#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY + using ViewType = Kokkos::View; + using MemSpace = typename ViewType::memory_space; static_assert(!SpaceAccessibility::accessible, "Impl::sort_device_view_with_comparator: should not be called " "on a view that is already accessible on the host"); diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp index b84f00f8bb..ea7e55ca61 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_Reduce.hpp @@ -91,7 +91,7 @@ template = 0> ValueType reduce(const ExecutionSpace& ex, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl( @@ -105,7 +105,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_exespace_impl(label, ex, first, last, @@ -119,7 +119,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -137,7 +137,7 @@ template & view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -157,7 +157,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -172,7 +172,7 @@ template ::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_exespace_impl( @@ -186,7 +186,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -204,7 +204,7 @@ template & view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -258,7 +258,7 @@ template < KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_default_functors_team_impl(teamHandle, first, last, @@ -273,7 +273,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; @@ -294,7 +294,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, IteratorType first, IteratorType last, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::reduce_custom_functors_team_impl(teamHandle, first, last, @@ -309,7 +309,7 @@ KOKKOS_FUNCTION ValueType reduce(const TeamHandleType& teamHandle, const ::Kokkos::View& view, ValueType init_reduction_value, BinaryOp joiner) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); namespace KE = ::Kokkos::Experimental; diff --git a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp index 101f5113f6..89585ddbea 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/Kokkos_TransformReduce.hpp @@ -117,7 +117,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType1 first1, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -136,7 +136,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -157,7 +157,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -182,7 +182,7 @@ ValueType transform_reduce( ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -208,7 +208,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, IteratorType first1, IteratorType last1, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -228,7 +228,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_exespace_impl( @@ -248,7 +248,7 @@ ValueType transform_reduce(const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -270,7 +270,7 @@ ValueType transform_reduce(const std::string& label, const ExecutionSpace& ex, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); @@ -345,7 +345,7 @@ KOKKOS_FUNCTION ValueType transform_reduce( const TeamHandleType& teamHandle, IteratorType1 first1, IteratorType1 last1, IteratorType2 first2, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -366,7 +366,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, BinaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(first_view); @@ -393,7 +393,7 @@ KOKKOS_FUNCTION ValueType transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); return Impl::transform_reduce_custom_functors_team_impl( @@ -412,7 +412,7 @@ transform_reduce(const TeamHandleType& teamHandle, ValueType init_reduction_value, BinaryJoinerType joiner, UnaryTransform transformer) { namespace KE = ::Kokkos::Experimental; - static_assert(std::is_move_constructible::value, + static_assert(std::is_move_constructible_v, "ValueType must be move constructible."); Impl::static_assert_is_admissible_to_kokkos_std_algorithms(view); diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp index 54bb13e25b..da16141f5a 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Constraints.hpp @@ -33,12 +33,12 @@ struct is_admissible_to_kokkos_std_algorithms : std::false_type {}; template struct is_admissible_to_kokkos_std_algorithms< T, std::enable_if_t<::Kokkos::is_view::value && T::rank() == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value)>> + (std::is_same_v || + std::is_same_v || + std::is_same_v)>> : std::true_type {}; template @@ -102,8 +102,8 @@ struct are_random_access_iterators; template struct are_random_access_iterators { static constexpr bool value = - is_iterator_v && std::is_base_of::value; + is_iterator_v && std::is_base_of_v; }; template @@ -165,9 +165,8 @@ struct iterators_have_matching_difference_type { template struct iterators_have_matching_difference_type { - static constexpr bool value = - std::is_same::value; + static constexpr bool value = std::is_same_v; }; template diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp index 9075562d46..dc910861d5 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_MoveBackward.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdMoveBackwardFunctor { using index_type = typename IteratorType1::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdMoveBackwardFunctor requires signed index type"); IteratorType1 m_last; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp index 5bce89e98f..e8c638c94c 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_RandomAccessIterator.hpp @@ -36,18 +36,18 @@ class RandomAccessIterator< ::Kokkos::View > { using iterator_type = RandomAccessIterator; using iterator_category = std::random_access_iterator_tag; - using value_type = typename view_type::value_type; + using value_type = typename view_type::non_const_value_type; using difference_type = ptrdiff_t; using pointer = typename view_type::pointer_type; using reference = typename view_type::reference_type; static_assert(view_type::rank == 1 && - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), "RandomAccessIterator only supports 1D Views with LayoutLeft, " "LayoutRight, LayoutStride."); @@ -61,9 +61,9 @@ class RandomAccessIterator< ::Kokkos::View > { #ifndef KOKKOS_ENABLE_CXX17 // C++20 and beyond template - requires(std::is_constructible_v) KOKKOS_FUNCTION - explicit(!std::is_convertible_v) - RandomAccessIterator(const RandomAccessIterator& other) + requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit(!std::is_convertible_v) + RandomAccessIterator(const RandomAccessIterator& other) : m_view(other.m_view), m_current_index(other.m_current_index) {} #else template < diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp index b4046c7645..e6caa07288 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_Reverse.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseFunctor requires signed index type"); InputIterator m_first; diff --git a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp index dd20d90e39..7aa0e4fc44 100644 --- a/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp +++ b/lib/kokkos/algorithms/src/std_algorithms/impl/Kokkos_ReverseCopy.hpp @@ -30,7 +30,7 @@ namespace Impl { template struct StdReverseCopyFunctor { using index_type = typename InputIterator::difference_type; - static_assert(std::is_signed::value, + static_assert(std::is_signed_v, "Kokkos: StdReverseCopyFunctor requires signed index type"); InputIterator m_last; diff --git a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt index db184bc8a9..31247af159 100644 --- a/lib/kokkos/algorithms/unit_tests/CMakeLists.txt +++ b/lib/kokkos/algorithms/unit_tests/CMakeLists.txt @@ -1,12 +1,10 @@ - #Leave these here for now - I don't need transitive deps anyway -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) - -SET(ALGORITHM UnitTestMain.cpp) +set(ALGORITHM UnitTestMain.cpp) foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) string(TOUPPER ${Tag} DEVICE) @@ -23,21 +21,11 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # Generate a .cpp file for each one that runs it on the current backend (Tag), # and add this .cpp file to the sources for UnitTest_RandomAndSort. set(ALGO_SORT_SOURCES) - foreach(SOURCE_Input - TestSort - TestSortByKey - TestSortCustomComp - TestBinSortA - TestBinSortB - TestNestedSort - ) + foreach(SOURCE_Input TestSort TestSortByKey TestSortCustomComp TestBinSortA TestBinSortB TestNestedSort) set(file ${dir}/${SOURCE_Input}.cpp) # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_SORT_SOURCES ${file}) endforeach() @@ -47,14 +35,9 @@ foreach(Tag Threads;Serial;OpenMP;Cuda;HPX;HIP;SYCL;OpenMPTarget) # ------------------------------------------ # do as above set(ALGO_RANDOM_SOURCES) - foreach(SOURCE_Input - TestRandom - ) + foreach(SOURCE_Input TestRandom) set(file ${dir}/${SOURCE_Input}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include <${SOURCE_Input}.hpp>\n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include <${SOURCE_Input}.hpp>\n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND ALGO_RANDOM_SOURCES ${file}) endforeach() @@ -65,11 +48,7 @@ endforeach() # std set A # ------------------------------------------ set(STDALGO_SOURCES_A) -foreach(Name - StdReducers - StdAlgorithmsConstraints - RandomAccessIterator - ) +foreach(Name StdReducers StdAlgorithmsConstraints RandomAccessIterator) list(APPEND STDALGO_SOURCES_A Test${Name}.cpp) endforeach() @@ -77,10 +56,7 @@ endforeach() # std set B # ------------------------------------------ set(STDALGO_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsMinMaxElementOps - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsMinMaxElementOps) list(APPEND STDALGO_SOURCES_B Test${Name}.cpp) endforeach() @@ -88,22 +64,23 @@ endforeach() # std set C # ------------------------------------------ set(STDALGO_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsLexicographicalCompare - StdAlgorithmsForEach - StdAlgorithmsFind - StdAlgorithmsFindFirstOf - StdAlgorithmsFindEnd - StdAlgorithmsCount - StdAlgorithmsEqual - StdAlgorithmsAllAnyNoneOf - StdAlgorithmsAdjacentFind - StdAlgorithmsSearch - StdAlgorithmsSearch_n - StdAlgorithmsMismatch - StdAlgorithmsMoveBackward - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsLexicographicalCompare + StdAlgorithmsForEach + StdAlgorithmsFind + StdAlgorithmsFindFirstOf + StdAlgorithmsFindEnd + StdAlgorithmsCount + StdAlgorithmsEqual + StdAlgorithmsAllAnyNoneOf + StdAlgorithmsAdjacentFind + StdAlgorithmsSearch + StdAlgorithmsSearch_n + StdAlgorithmsMismatch + StdAlgorithmsMoveBackward +) list(APPEND STDALGO_SOURCES_C Test${Name}.cpp) endforeach() @@ -111,27 +88,28 @@ endforeach() # std set D # ------------------------------------------ set(STDALGO_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsModOps - StdAlgorithmsModSeqOps - StdAlgorithmsReplace - StdAlgorithmsReplaceIf - StdAlgorithmsReplaceCopy - StdAlgorithmsReplaceCopyIf - StdAlgorithmsCopyIf - StdAlgorithmsUnique - StdAlgorithmsUniqueCopy - StdAlgorithmsRemove - StdAlgorithmsRemoveIf - StdAlgorithmsRemoveCopy - StdAlgorithmsRemoveCopyIf - StdAlgorithmsRotate - StdAlgorithmsRotateCopy - StdAlgorithmsReverse - StdAlgorithmsShiftLeft - StdAlgorithmsShiftRight - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsModOps + StdAlgorithmsModSeqOps + StdAlgorithmsReplace + StdAlgorithmsReplaceIf + StdAlgorithmsReplaceCopy + StdAlgorithmsReplaceCopyIf + StdAlgorithmsCopyIf + StdAlgorithmsUnique + StdAlgorithmsUniqueCopy + StdAlgorithmsRemove + StdAlgorithmsRemoveIf + StdAlgorithmsRemoveCopy + StdAlgorithmsRemoveCopyIf + StdAlgorithmsRotate + StdAlgorithmsRotateCopy + StdAlgorithmsReverse + StdAlgorithmsShiftLeft + StdAlgorithmsShiftRight +) list(APPEND STDALGO_SOURCES_D Test${Name}.cpp) endforeach() @@ -139,20 +117,21 @@ endforeach() # std set E # ------------------------------------------ set(STDALGO_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsIsSorted - StdAlgorithmsIsSortedUntil - StdAlgorithmsPartitioningOps - StdAlgorithmsPartitionCopy - StdAlgorithmsNumerics - StdAlgorithmsAdjacentDifference - StdAlgorithmsExclusiveScan - StdAlgorithmsInclusiveScan - StdAlgorithmsTransformUnaryOp - StdAlgorithmsTransformExclusiveScan - StdAlgorithmsTransformInclusiveScan - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsIsSorted + StdAlgorithmsIsSortedUntil + StdAlgorithmsPartitioningOps + StdAlgorithmsPartitionCopy + StdAlgorithmsNumerics + StdAlgorithmsAdjacentDifference + StdAlgorithmsExclusiveScan + StdAlgorithmsInclusiveScan + StdAlgorithmsTransformUnaryOp + StdAlgorithmsTransformExclusiveScan + StdAlgorithmsTransformInclusiveScan +) list(APPEND STDALGO_SOURCES_E Test${Name}.cpp) endforeach() @@ -160,11 +139,7 @@ endforeach() # std team Q # ------------------------------------------ set(STDALGO_TEAM_SOURCES_Q) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamInclusiveScan - StdAlgorithmsTeamTransformInclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamInclusiveScan StdAlgorithmsTeamTransformInclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_Q Test${Name}.cpp) endforeach() @@ -172,11 +147,7 @@ endforeach() # std team P # ------------------------------------------ set(STDALGO_TEAM_SOURCES_P) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamExclusiveScan - StdAlgorithmsTeamTransformExclusiveScan - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamExclusiveScan StdAlgorithmsTeamTransformExclusiveScan) list(APPEND STDALGO_TEAM_SOURCES_P Test${Name}.cpp) endforeach() @@ -184,14 +155,9 @@ endforeach() # std team M # ------------------------------------------ set(STDALGO_TEAM_SOURCES_M) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamTransformUnaryOp - StdAlgorithmsTeamTransformBinaryOp - StdAlgorithmsTeamGenerate - StdAlgorithmsTeamGenerate_n - StdAlgorithmsTeamSwapRanges - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamTransformUnaryOp StdAlgorithmsTeamTransformBinaryOp + StdAlgorithmsTeamGenerate StdAlgorithmsTeamGenerate_n StdAlgorithmsTeamSwapRanges +) list(APPEND STDALGO_TEAM_SOURCES_M Test${Name}.cpp) endforeach() @@ -199,14 +165,9 @@ endforeach() # std team L # ------------------------------------------ set(STDALGO_TEAM_SOURCES_L) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamIsSorted - StdAlgorithmsTeamIsSortedUntil - StdAlgorithmsTeamIsPartitioned - StdAlgorithmsTeamPartitionCopy - StdAlgorithmsTeamPartitionPoint - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamIsSorted StdAlgorithmsTeamIsSortedUntil + StdAlgorithmsTeamIsPartitioned StdAlgorithmsTeamPartitionCopy StdAlgorithmsTeamPartitionPoint +) list(APPEND STDALGO_TEAM_SOURCES_L Test${Name}.cpp) endforeach() @@ -214,13 +175,9 @@ endforeach() # std team I # ------------------------------------------ set(STDALGO_TEAM_SOURCES_I) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamUnique - StdAlgorithmsTeamAdjacentDifference - StdAlgorithmsTeamReduce - StdAlgorithmsTeamTransformReduce - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamUnique StdAlgorithmsTeamAdjacentDifference StdAlgorithmsTeamReduce + StdAlgorithmsTeamTransformReduce +) list(APPEND STDALGO_TEAM_SOURCES_I Test${Name}.cpp) endforeach() @@ -228,18 +185,19 @@ endforeach() # std team H # ------------------------------------------ set(STDALGO_TEAM_SOURCES_H) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamCopy - StdAlgorithmsTeamCopy_n - StdAlgorithmsTeamCopyBackward - StdAlgorithmsTeamCopyIf - StdAlgorithmsTeamUniqueCopy - StdAlgorithmsTeamRemove - StdAlgorithmsTeamRemoveIf - StdAlgorithmsTeamRemoveCopy - StdAlgorithmsTeamRemoveCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamCopy + StdAlgorithmsTeamCopy_n + StdAlgorithmsTeamCopyBackward + StdAlgorithmsTeamCopyIf + StdAlgorithmsTeamUniqueCopy + StdAlgorithmsTeamRemove + StdAlgorithmsTeamRemoveIf + StdAlgorithmsTeamRemoveCopy + StdAlgorithmsTeamRemoveCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_H Test${Name}.cpp) endforeach() @@ -247,13 +205,9 @@ endforeach() # std team G # ------------------------------------------ set(STDALGO_TEAM_SOURCES_G) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMove - StdAlgorithmsTeamMoveBackward - StdAlgorithmsTeamShiftLeft - StdAlgorithmsTeamShiftRight - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMove StdAlgorithmsTeamMoveBackward StdAlgorithmsTeamShiftLeft + StdAlgorithmsTeamShiftRight +) list(APPEND STDALGO_TEAM_SOURCES_G Test${Name}.cpp) endforeach() @@ -261,13 +215,9 @@ endforeach() # std team F # ------------------------------------------ set(STDALGO_TEAM_SOURCES_F) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamReverse - StdAlgorithmsTeamReverseCopy - StdAlgorithmsTeamRotate - StdAlgorithmsTeamRotateCopy - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamReverse StdAlgorithmsTeamReverseCopy StdAlgorithmsTeamRotate + StdAlgorithmsTeamRotateCopy +) list(APPEND STDALGO_TEAM_SOURCES_F Test${Name}.cpp) endforeach() @@ -275,15 +225,16 @@ endforeach() # std team E # ------------------------------------------ set(STDALGO_TEAM_SOURCES_E) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFill - StdAlgorithmsTeamFill_n - StdAlgorithmsTeamReplace - StdAlgorithmsTeamReplaceIf - StdAlgorithmsTeamReplaceCopy - StdAlgorithmsTeamReplaceCopyIf - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFill + StdAlgorithmsTeamFill_n + StdAlgorithmsTeamReplace + StdAlgorithmsTeamReplaceIf + StdAlgorithmsTeamReplaceCopy + StdAlgorithmsTeamReplaceCopyIf +) list(APPEND STDALGO_TEAM_SOURCES_E Test${Name}.cpp) endforeach() @@ -291,12 +242,7 @@ endforeach() # std team D # ------------------------------------------ set(STDALGO_TEAM_SOURCES_D) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamMinElement - StdAlgorithmsTeamMaxElement - StdAlgorithmsTeamMinMaxElement - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamMinElement StdAlgorithmsTeamMaxElement StdAlgorithmsTeamMinMaxElement) list(APPEND STDALGO_TEAM_SOURCES_D Test${Name}.cpp) endforeach() @@ -304,16 +250,17 @@ endforeach() # std team C # ------------------------------------------ set(STDALGO_TEAM_SOURCES_C) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamFind - StdAlgorithmsTeamFindIf - StdAlgorithmsTeamFindIfNot - StdAlgorithmsTeamAllOf - StdAlgorithmsTeamAnyOf - StdAlgorithmsTeamNoneOf - StdAlgorithmsTeamSearchN - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamFind + StdAlgorithmsTeamFindIf + StdAlgorithmsTeamFindIfNot + StdAlgorithmsTeamAllOf + StdAlgorithmsTeamAnyOf + StdAlgorithmsTeamNoneOf + StdAlgorithmsTeamSearchN +) list(APPEND STDALGO_TEAM_SOURCES_C Test${Name}.cpp) endforeach() @@ -321,13 +268,9 @@ endforeach() # std team B # ------------------------------------------ set(STDALGO_TEAM_SOURCES_B) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamEqual - StdAlgorithmsTeamSearch - StdAlgorithmsTeamFindEnd - StdAlgorithmsTeamFindFirstOf - ) +foreach(Name StdAlgorithmsCommon StdAlgorithmsTeamEqual StdAlgorithmsTeamSearch StdAlgorithmsTeamFindEnd + StdAlgorithmsTeamFindFirstOf +) list(APPEND STDALGO_TEAM_SOURCES_B Test${Name}.cpp) endforeach() @@ -335,34 +278,33 @@ endforeach() # std team A # ------------------------------------------ set(STDALGO_TEAM_SOURCES_A) -foreach(Name - StdAlgorithmsCommon - StdAlgorithmsTeamAdjacentFind - StdAlgorithmsTeamCount - StdAlgorithmsTeamCountIf - StdAlgorithmsTeamForEach - StdAlgorithmsTeamForEachN - StdAlgorithmsTeamLexicographicalCompare - StdAlgorithmsTeamMismatch - ) +foreach( + Name + StdAlgorithmsCommon + StdAlgorithmsTeamAdjacentFind + StdAlgorithmsTeamCount + StdAlgorithmsTeamCountIf + StdAlgorithmsTeamForEach + StdAlgorithmsTeamForEachN + StdAlgorithmsTeamLexicographicalCompare + StdAlgorithmsTeamMismatch +) list(APPEND STDALGO_TEAM_SOURCES_A Test${Name}.cpp) endforeach() # FIXME_OPENMPTARGET - remove sort test as it leads to ICE with clang/16 and above at compile time. -if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 16.0.0) - list(REMOVE_ITEM ALGO_SORT_SOURCES - TestSort.cpp - ) +if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL "Clang" AND KOKKOS_CXX_COMPILER_VERSION + VERSION_GREATER_EQUAL 16.0.0 +) + list(REMOVE_ITEM ALGO_SORT_SOURCES TestSort.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget because in these cases # the impl needs to use either Kokkos or tailored reducers # which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L - TestStdAlgorithmsTeamIsPartitioned.cpp - TestStdAlgorithmsTeamPartitionPoint.cpp - TestStdAlgorithmsTeamPartitionCopy.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_L TestStdAlgorithmsTeamIsPartitioned.cpp + TestStdAlgorithmsTeamPartitionPoint.cpp TestStdAlgorithmsTeamPartitionCopy.cpp ) endif() @@ -370,7 +312,9 @@ endif() # in these cases the impl needs to use either Kokkos or # tailored reducers which results in runtime memory errors. if(KOKKOS_ENABLE_OPENMPTARGET) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_C + list( + REMOVE_ITEM + STDALGO_TEAM_SOURCES_C TestStdAlgorithmsTeamFind.cpp TestStdAlgorithmsTeamFindIf.cpp TestStdAlgorithmsTeamFindIfNot.cpp @@ -386,35 +330,20 @@ endif() # FRIZZI: 04/26/2023: not sure if the compilation error is still applicable # but we conservatively leave this guard on if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Sort - SOURCES - UnitTestMain.cpp - TestStdAlgorithmsCommon.cpp - ${ALGO_SORT_SOURCES} + kokkos_add_executable_and_test( + UnitTest_Sort SOURCES UnitTestMain.cpp TestStdAlgorithmsCommon.cpp ${ALGO_SORT_SOURCES} ) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - UnitTest_Random - SOURCES - UnitTestMain.cpp - ${ALGO_RANDOM_SOURCES} - ) + kokkos_add_executable_and_test(UnitTest_Random SOURCES UnitTestMain.cpp ${ALGO_RANDOM_SOURCES}) endif() # FIXME_OPENMPTARGET: These tests cause internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM) - list(REMOVE_ITEM STDALGO_SOURCES_D - TestStdAlgorithmsCopyIf.cpp - TestStdAlgorithmsRemoveCopy.cpp - TestStdAlgorithmsUnique.cpp - TestStdAlgorithmsUniqueCopy.cpp - ) - list(REMOVE_ITEM STDALGO_SOURCES_E - TestStdAlgorithmsExclusiveScan.cpp - TestStdAlgorithmsInclusiveScan.cpp + list(REMOVE_ITEM STDALGO_SOURCES_D TestStdAlgorithmsCopyIf.cpp TestStdAlgorithmsRemoveCopy.cpp + TestStdAlgorithmsUnique.cpp TestStdAlgorithmsUniqueCopy.cpp ) + list(REMOVE_ITEM STDALGO_SOURCES_E TestStdAlgorithmsExclusiveScan.cpp TestStdAlgorithmsInclusiveScan.cpp) endif() # FIXME_OPENMPTARGET remove tests for OpenMPTarget @@ -422,48 +351,31 @@ endif() if(KOKKOS_ENABLE_OPENMPTARGET) # the following use either Kokkos or tailored reducers # which results in runtime memory errors. - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B - TestStdAlgorithmsTeamFindEnd.cpp - TestStdAlgorithmsTeamFindFirstOf.cpp - TestStdAlgorithmsTeamSearch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_B TestStdAlgorithmsTeamFindEnd.cpp TestStdAlgorithmsTeamFindFirstOf.cpp + TestStdAlgorithmsTeamSearch.cpp ) - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A - TestStdAlgorithmsTeamAdjacentFind.cpp - TestStdAlgorithmsTeamLexicographicalCompare.cpp - TestStdAlgorithmsTeamMismatch.cpp + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_A TestStdAlgorithmsTeamAdjacentFind.cpp + TestStdAlgorithmsTeamLexicographicalCompare.cpp TestStdAlgorithmsTeamMismatch.cpp ) # this causes an illegal memory access if team_members_have_matching_result # is called - list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M - TestStdAlgorithmsTeamTransformBinaryOp.cpp - ) + list(REMOVE_ITEM STDALGO_TEAM_SOURCES_M TestStdAlgorithmsTeamTransformBinaryOp.cpp) endif() foreach(ID A;B;C;D;E) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_SOURCES_${ID}} - ) + kokkos_add_executable_and_test(AlgorithmsUnitTest_StdSet_${ID} SOURCES UnitTestMain.cpp ${STDALGO_SOURCES_${ID}}) endforeach() foreach(ID A;B;C;D;E;F;G;H;I;L;M;P;Q) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - AlgorithmsUnitTest_StdSet_Team_${ID} - SOURCES - UnitTestMain.cpp - ${STDALGO_TEAM_SOURCES_${ID}} - ) + kokkos_add_executable_and_test( + AlgorithmsUnitTest_StdSet_Team_${ID} SOURCES UnitTestMain.cpp ${STDALGO_TEAM_SOURCES_${ID}} + ) endforeach() # FIXME_OPENMPTARGET This test causes internal compiler errors as of 09/01/22 # when compiling for Intel's Xe-HP GPUs. if(NOT (KOKKOS_ENABLE_OPENMPTARGET AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM)) - KOKKOS_ADD_EXECUTABLE( - AlgorithmsUnitTest_StdAlgoCompileOnly - SOURCES TestStdAlgorithmsCompileOnly.cpp - ) + kokkos_add_executable(AlgorithmsUnitTest_StdAlgoCompileOnly SOURCES TestStdAlgorithmsCompileOnly.cpp) endif() diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp index dd3569e671..bb074f2480 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortA.hpp @@ -31,13 +31,13 @@ struct bin3d_is_sorted_struct { using value_type = unsigned int; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; int max_bins; Scalar min; Scalar max; - bin3d_is_sorted_struct(Kokkos::View keys_, + bin3d_is_sorted_struct(Kokkos::View keys_, int max_bins_, Scalar min_, Scalar max_) : keys(keys_), max_bins(max_bins_), min(min_), max(max_) {} KOKKOS_INLINE_FUNCTION @@ -65,9 +65,9 @@ struct sum3D { using value_type = double; using execution_space = ExecutionSpace; - Kokkos::View keys; + Kokkos::View keys; - sum3D(Kokkos::View keys_) : keys(keys_) {} + sum3D(Kokkos::View keys_) : keys(keys_) {} KOKKOS_INLINE_FUNCTION void operator()(int i, double& count) const { count += keys(i, 0); @@ -77,8 +77,8 @@ struct sum3D { }; template -void test_3D_sort_impl(unsigned int n) { - using KeyViewType = Kokkos::View; +void test_3D_sort_impl(size_t n) { + using KeyViewType = Kokkos::View; KeyViewType keys("Keys", n * n * n); @@ -207,7 +207,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -219,6 +219,10 @@ void test_sort_integer_overflow() { } // namespace BinSortSetA TEST(TEST_CATEGORY, BinSortGenericTests) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -246,11 +250,11 @@ TEST(TEST_CATEGORY, BinSortEmptyView) { // does not matter if we use int or something else Kokkos::View v("v", 0); - // test all exposed public sort methods - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(v, 0, 0)); - ASSERT_NO_THROW(Sorter.sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Sorter.sort(v)); + // test all exposed public sort methods are callable and do not throw + Sorter.sort(ExecutionSpace(), v, 0, 0); + Sorter.sort(v, 0, 0); + Sorter.sort(ExecutionSpace(), v); + Sorter.sort(v); } TEST(TEST_CATEGORY, BinSortEmptyKeysView) { @@ -263,7 +267,26 @@ TEST(TEST_CATEGORY, BinSortEmptyKeysView) { BinOp_t binOp(5, 0, 10); Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp); - ASSERT_NO_THROW(Sorter.create_permute_vector(ExecutionSpace{})); + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw +} + +// BinSort may delegate sorting within bins to std::sort when running on host +// and having a sufficiently large number of items within a single bin (10 by +// default). Test that this is done without undefined behavior when accessing +// the boundaries of the bin. Should be used in conjunction with a memory +// sanitizer or bounds check. +TEST(TEST_CATEGORY, BinSort_issue_7221) { + using ExecutionSpace = TEST_EXECSPACE; + + using KeyViewType = Kokkos::View; + KeyViewType kv("kv", 11); + + using BinOp_t = Kokkos::BinOp1D; + BinOp_t binOp(1, -10, 10); + Kokkos::BinSort Sorter(ExecutionSpace{}, kv, binOp, + /*sort_within_bins*/ true); + + Sorter.create_permute_vector(ExecutionSpace{}); // does not throw } } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp index a90224bf31..d11b53a9a6 100644 --- a/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestBinSortB.hpp @@ -185,6 +185,10 @@ void run_for_rank2() { } // namespace BinSortSetB TEST(TEST_CATEGORY, BinSortUnsignedKeyLayoutStrideValues) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExeSpace = TEST_EXECSPACE; using key_type = unsigned; BinSortSetB::run_for_rank1(); diff --git a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp index 1b7a3f48fc..cd57fd23ec 100644 --- a/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestNestedSort.hpp @@ -386,6 +386,11 @@ void test_nested_sort_by_key(unsigned int N, KeyType minKey, KeyType maxKey, } // namespace NestedSortImpl TEST(TEST_CATEGORY, NestedSort) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; NestedSortImpl::test_nested_sort(171, 0U, UINT_MAX); NestedSortImpl::test_nested_sort(42, -1e6f, 1e6f); @@ -394,6 +399,11 @@ TEST(TEST_CATEGORY, NestedSort) { } TEST(TEST_CATEGORY, NestedSortByKey) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; // Second/third template arguments are key and value respectively. diff --git a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp index 472af1403b..6960b912d0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandom.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandom.hpp @@ -542,6 +542,11 @@ void test_duplicate_stream() { } // namespace AlgoRandomImpl TEST(TEST_CATEGORY, Random_XorShift64) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif + using ExecutionSpace = TEST_EXECSPACE; #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ @@ -562,6 +567,10 @@ TEST(TEST_CATEGORY, Random_XorShift64) { TEST(TEST_CATEGORY, Random_XorShift1024_0) { using ExecutionSpace = TEST_EXECSPACE; + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif #if defined(KOKKOS_ENABLE_SYCL) || defined(KOKKOS_ENABLE_CUDA) || \ defined(KOKKOS_ENABLE_HIP) @@ -589,7 +598,7 @@ TEST(TEST_CATEGORY, Multi_streams) { #endif #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) - if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { GTEST_SKIP() << "Failing on NVIDIA GPUs"; // FIXME_SYCL } #endif diff --git a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp index 7d484136b6..5ab348cb19 100644 --- a/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestRandomAccessIterator.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct random_access_iterator_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); @@ -264,6 +264,37 @@ TEST_F(random_access_iterator_test, traits_helpers) { static_assert(KE::Impl::are_iterators_v); static_assert(KE::Impl::are_random_access_iterators_v); static_assert(!KE::Impl::are_iterators_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + static_assert( + std::is_same_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); } } // namespace stdalgos diff --git a/lib/kokkos/algorithms/unit_tests/TestSort.hpp b/lib/kokkos/algorithms/unit_tests/TestSort.hpp index 968fb8950b..5ea88ae5d6 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSort.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSort.hpp @@ -197,7 +197,7 @@ void test_sort_integer_overflow() { // array with two extrema in reverse order to expose integer overflow bug in // bin calculation T a[2] = {Kokkos::Experimental::finite_max::value, - Kokkos::Experimental::finite_min::value}; + Kokkos::Experimental::finite_min::value}; auto vd = Kokkos::create_mirror_view_and_copy( ExecutionSpace(), Kokkos::View(a)); Kokkos::sort(vd); @@ -209,6 +209,10 @@ void test_sort_integer_overflow() { } // namespace SortImpl TEST(TEST_CATEGORY, SortUnsignedValueType) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; using key_type = unsigned; constexpr int N = 171; @@ -224,14 +228,19 @@ TEST(TEST_CATEGORY, SortUnsignedValueType) { } TEST(TEST_CATEGORY, SortEmptyView) { + // FIXME_OPENMPTARGET - causes runtime failure with CrayClang compiler +#if defined(KOKKOS_COMPILER_CRAY_LLVM) && defined(KOKKOS_ENABLE_OPENMPTARGET) + GTEST_SKIP() << "known to fail with OpenMPTarget+Cray LLVM"; +#endif using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else Kokkos::View v("v", 0); + // checking that it does not throw // TODO check the synchronous behavior of the calls below - ASSERT_NO_THROW(Kokkos::sort(ExecutionSpace(), v)); - ASSERT_NO_THROW(Kokkos::sort(v)); + Kokkos::sort(ExecutionSpace(), v); + Kokkos::sort(v); } } // namespace Test diff --git a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp index 9e5bd4a574..44abe4e73a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestSortByKey.hpp @@ -83,8 +83,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyView) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } // Test #7036 @@ -95,8 +95,8 @@ TEST(TEST_CATEGORY, SortByKeyEmptyViewHost) { Kokkos::View keys("keys", 0); Kokkos::View values("values", 0); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(ExecutionSpace(), keys, values); } TEST(TEST_CATEGORY, SortByKey) { @@ -183,12 +183,12 @@ TEST(TEST_CATEGORY, SortByKeyStaticExtents) { Kokkos::View keys("keys"); Kokkos::View values_static("values_static"); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_static)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_static); Kokkos::View values_dynamic("values_dynamic", 10); - ASSERT_NO_THROW( - Kokkos::Experimental::sort_by_key(space, keys, values_dynamic)); + // checking that it does not throw + Kokkos::Experimental::sort_by_key(space, keys, values_dynamic); } template @@ -234,7 +234,9 @@ TEST(TEST_CATEGORY, SortByKeyWithStrides) { ASSERT_EQ(sort_fails, 0u); } -TEST(TEST_CATEGORY, SortByKeyKeysLargerThanValues) { +TEST(TEST_CATEGORY_DEATH, SortByKeyKeysLargerThanValues) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + using ExecutionSpace = TEST_EXECSPACE; // does not matter if we use int or something else diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp index 75ad533f6e..208b46b15f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentDifference.cpp @@ -96,7 +96,7 @@ void fill_view(DestViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, aux_v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp index fa4ff48dbe..d8b80675c9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsAdjacentFind.cpp @@ -173,7 +173,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -243,7 +243,7 @@ void run_single_scenario(const InfoType& scenario_info, Args... args) { { auto res_it = KE::adjacent_find(exespace(), KE::cbegin(view), - KE::cend(view), args...); + KE::cend(view), args...); const auto my_diff = res_it - KE::cbegin(view); verify(my_diff, view, args...); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp index 67052e2f9d..dadce2d474 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCommon.hpp @@ -534,10 +534,10 @@ void fill_views_inc(ViewType view, ViewHostType host_view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); auto view_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), view); for (std::size_t i = 0; i < view_h.extent(0); i++) { @@ -546,10 +546,10 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> verify_values(ValueType expected, const ViewType view) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of view and reference value"); using non_strided_view_t = Kokkos::View; @@ -566,11 +566,11 @@ verify_values(ValueType expected, const ViewType view) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); auto expected_h = Kokkos::create_mirror_view_and_copy(Kokkos::HostSpace(), expected); @@ -583,11 +583,11 @@ compare_views(ViewType1 expected, const ViewType2 actual) { } template -std::enable_if_t::value> +std::enable_if_t> compare_views(ViewType1 expected, const ViewType2 actual) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of expected and actual view"); using non_strided_view_t = Kokkos::View; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp index 2a4525a8c3..923ea970f9 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsConstraints.cpp @@ -81,7 +81,7 @@ TEST(std_algorithms, is_admissible_to_std_algorithms) { strided_view_3d_t>::value); } -TEST(std_algorithms, expect_no_overlap) { +TEST(std_algorithms_DeathTest, expect_no_overlap) { namespace KE = Kokkos::Experimental; using value_type = double; @@ -104,6 +104,8 @@ TEST(std_algorithms, expect_no_overlap) { // Overlapping because iterators are identical #if defined(KOKKOS_ENABLE_DEBUG) + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + auto first_s = KE::begin(static_view_1d); auto last_s = first_s + extent0; EXPECT_DEATH({ KE::Impl::expect_no_overlap(first_s, last_s, first_s); }, @@ -148,8 +150,7 @@ TEST(std_algorithms, expect_no_overlap) { auto last_st0 = first_st0 + strided_view_1d_0.extent(0); auto first_st1 = KE::begin(strided_view_1d_1); // [3, 15) // Does not overlap since offset (=3) is not divisible by stride (=2) - EXPECT_NO_THROW( - { KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); }); + KE::Impl::expect_no_overlap(first_st0, last_st0, first_st1); // Iterating over the same range without overlapping Kokkos::View static_view_2d{ @@ -160,9 +161,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_s0 = sub_first_s0 + sub_static_view_1d_0.extent(0); auto sub_first_s1 = KE::begin(sub_static_view_1d_1); // 1, 3, 5, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); - }); + KE::Impl::expect_no_overlap(sub_first_s0, sub_last_s0, sub_first_s1); Kokkos::View dynamic_view_2d{ "std-algo-test-2d-contiguous-view-dynamic", 2, extent0}; @@ -172,9 +171,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_d0 = sub_first_d0 + sub_dynamic_view_1d_0.extent(0); auto sub_first_d1 = KE::begin(sub_dynamic_view_1d_1); // 1, 3, 5, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); - }); + KE::Impl::expect_no_overlap(sub_first_d0, sub_last_d0, sub_first_d1); Kokkos::LayoutStride layout2d{2, 3, extent0, 2 * 3}; Kokkos::View strided_view_2d{ @@ -185,9 +182,7 @@ TEST(std_algorithms, expect_no_overlap) { auto sub_last_st0 = sub_first_st0 + sub_strided_view_1d_0.extent(0); auto sub_first_st1 = KE::begin(sub_strided_view_1d_1); // 1, 7, 13, ... - EXPECT_NO_THROW({ - KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); - }); + KE::Impl::expect_no_overlap(sub_first_st0, sub_last_st0, sub_first_st1); } } // namespace stdalgos diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp index 5778e37be0..7c9e8f84bf 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsCopyIf.cpp @@ -107,7 +107,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name, } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -202,7 +202,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } @@ -224,7 +224,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if(exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } @@ -233,7 +233,7 @@ void run_single_scenario(const InfoType& scenario_info) { auto n = fill_view(view_from, name, pred); auto view_dest = create_view(Tag{}, view_ext, "copy_if_dest"); auto rit = KE::copy_if("label", exespace(), KE::cbegin(view_from), - KE::cend(view_from), KE::begin(view_dest), pred); + KE::cend(view_from), KE::begin(view_dest), pred); verify_data(name, view_from, view_dest, pred); ASSERT_EQ(rit, (KE::begin(view_dest) + n)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp index b364c53a88..a85e63fe34 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsExclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp index 793b98a67f..b24730ff00 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsForEach.cpp @@ -55,7 +55,6 @@ void test_for_each(const ViewType view) { std::for_each(KE::begin(expected), KE::end(expected), non_mod_functor); compare_views(expected, view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) const auto mod_lambda = KOKKOS_LAMBDA(value_t & i) { ++i; }; // pass view, lambda takes non-const ref @@ -79,7 +78,6 @@ void test_for_each(const ViewType view) { KE::for_each(exespace(), KE::cbegin(view), KE::cend(view), non_mod_lambda); std::for_each(KE::cbegin(expected), KE::cend(expected), non_mod_lambda); compare_views(expected, view); -#endif } // std::for_each_n is C++17, so we cannot compare results directly diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp index 8dbd6cd7e3..2b3361743e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsHelperFunctors.hpp @@ -104,7 +104,7 @@ struct AssignIndexFunctor { template struct IsEvenFunctor { - static_assert(std::is_integral::value, + static_assert(std::is_integral_v, "IsEvenFunctor uses operator%, so ValueType must be int"); KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp index a08a737210..b4f40b4651 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsInclusiveScan.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp index 75d4f0afeb..18928a3526 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSorted.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -122,7 +122,8 @@ bool compute_gold(const std::string& name) { } else if (name == "large-b") { return false; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return false; // unreachable } } @@ -154,7 +155,7 @@ void run_single_scenario(const InfoType& scenario_info) { resultsB[0] = KE::is_sorted(exespace(), KE::cbegin(view), KE::cend(view), comp); resultsB[1] = KE::is_sorted("label", exespace(), KE::cbegin(view), - KE::cend(view), comp); + KE::cend(view), comp); resultsB[2] = KE::is_sorted(exespace(), view, comp); resultsB[3] = KE::is_sorted("label", exespace(), view, comp); const auto allB = std::all_of(resultsB.cbegin(), resultsB.cend(), diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp index 29ac7cc9bc..8327bfe13c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsIsSortedUntil.cpp @@ -92,7 +92,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -123,7 +123,8 @@ auto compute_gold(ViewType view, const std::string& name) { } else if (name == "large-b") { return KE::begin(view) + 156; } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); + return KE::end(view); // unreachable } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp index f3b3e269c4..df5df756d2 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMismatch.cpp @@ -86,7 +86,7 @@ void run_single_scenario(ViewType view1, ViewType view2, v2_h(ext2 / 2) = -5; } } else { - throw std::runtime_error("Kokkos: stdalgo: test: mismatch: Invalid string"); + FAIL() << "Kokkos: stdalgo: test: mismatch: Invalid string"; } Kokkos::deep_copy(aux_view1, v1_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp index 1b1a02f39c..6918185bc0 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModOps.cpp @@ -48,7 +48,7 @@ struct MyMovableType { TEST(std_algorithms_mod_ops_test, move) { MyMovableType a; using move_t = decltype(std::move(a)); - static_assert(std::is_rvalue_reference::value); + static_assert(std::is_rvalue_reference_v); // move constr MyMovableType b(std::move(a)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp index f80f30797e..42a17d7377 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsModSeqOps.cpp @@ -23,7 +23,7 @@ namespace stdalgos { struct std_algorithms_mod_seq_ops_test : std_algorithms_test { public: - virtual void SetUp() { + void SetUp() override { Kokkos::parallel_for(m_static_view.extent(0), AssignIndexFunctor(m_static_view)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp index b201ab95c1..88e2a68ff1 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsMoveBackward.cpp @@ -56,7 +56,7 @@ void run_single_scenario(const InfoType& scenario_info, int apiId) { ASSERT_EQ(dist, 5); } else if (apiId == 1) { auto rit = KE::move_backward("mylabel", exespace(), KE::begin(v), - KE::end(v), KE::end(v2)); + KE::end(v), KE::end(v2)); const int dist = KE::distance(KE::begin(v2), rit); ASSERT_EQ(dist, 5); } else if (apiId == 2) { diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp index a36c9db2b9..e47cacdd7d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsPartitionCopy.cpp @@ -95,7 +95,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -110,9 +110,9 @@ void verify_data(const std::string& name, ResultType my_result, ViewTypeDestFalse view_dest_false, PredType pred) { using value_type = typename ViewTypeFrom::value_type; static_assert( - std::is_same::value); + std::is_same_v); static_assert( - std::is_same::value); + std::is_same_v); const std::size_t ext = view_from.extent(0); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp index c35fc5c24b..f897e9b657 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemove.cpp @@ -99,7 +99,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -147,7 +147,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove(exespace(), KE::begin(view), KE::end(view), - (ValueType)match_value); + (ValueType)match_value); verify_data(data_h, view, rit); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp index 3d7c52108b..3137880ea8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopy.cpp @@ -110,7 +110,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp index cb699aa923..d88ab5473d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveCopyIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp index f06f2234ee..e42788799e 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRemoveIf.cpp @@ -93,7 +93,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -144,7 +144,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if(exespace(), KE::begin(view), KE::end(view), - remove_if_even); + remove_if_even); verify_data(data_h, view, rit, remove_if_even); } @@ -154,7 +154,7 @@ void run_single_scenario(const InfoType& scenario_info) { // make host copy BEFORE running algo auto data_h = create_host_space_copy(view); auto rit = KE::remove_if("label", exespace(), KE::begin(view), - KE::end(view), remove_if_even); + KE::end(view), remove_if_even); verify_data(data_h, view, rit, remove_if_even); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp index a22ab32d76..4596726cf3 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplace.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -153,7 +153,7 @@ void verify_data(const std::string& name, ViewType1 test_view, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp index a964ec8e17..b18c859af5 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopy.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp index ceeba88971..82f859bac1 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceCopyIf.cpp @@ -84,7 +84,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp index 802c0093c5..5ae2ff4278 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReplaceIf.cpp @@ -96,7 +96,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp index 6e6ca72783..3c934d6485 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsReverse.cpp @@ -62,7 +62,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp index 5638cbee4a..bf5c2ee782 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotate.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp index d0caca7cea..1a860c58ce 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsRotateCopy.cpp @@ -117,7 +117,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -175,7 +175,7 @@ void run_single_scenario(const InfoType& scenario_info, create_view(Tag{}, view_ext, "rotate_copy_dest"); auto n_it = KE::cbegin(view_from) + rotation_point; auto rit = KE::rotate_copy(exespace(), KE::cbegin(view_from), n_it, - KE::cend(view_from), KE::begin(view_dest)); + KE::cend(view_from), KE::begin(view_dest)); verify_data(view_from, view_dest, rotation_point); ASSERT_EQ(rit, (KE::begin(view_dest) + view_ext)); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp index 021609c444..195f88a0b7 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch.cpp @@ -256,7 +256,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t seq_ext, { auto myrit = KE::search(exespace(), KE::cbegin(view), KE::cend(view), - KE::cbegin(s_view), KE::cend(s_view), args...); + KE::cbegin(s_view), KE::cend(s_view), args...); const auto mydiff = myrit - KE::cbegin(view); const auto stddiff = stdrit - KE::cbegin(view_h); ASSERT_EQ(mydiff, stddiff); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp index 53ad8daa2e..79d88bec23 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsSearch_n.cpp @@ -154,7 +154,7 @@ void fill_view(ViewType dest_view, ValueType value, std::size_t count, } else { - throw std::runtime_error("Kokkos: test: search_n: this should not happen"); + FAIL() << "Kokkos: test: search_n: this should not happen"; } Kokkos::deep_copy(aux_view, v_h); @@ -208,7 +208,7 @@ void run_single_scenario(const InfoType& scenario_info, std::size_t count, { auto myrit = KE::search_n("label", exespace(), KE::cbegin(view), - KE::cend(view), count, value, args...); + KE::cend(view), count, value, args...); const auto mydiff = myrit - KE::cbegin(view); ASSERT_EQ(mydiff, stddiff); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp index 0b5fe9216e..12835d5a2f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftLeft.cpp @@ -150,7 +150,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_left or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_left("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp index 8e4ae94375..3e350cf3b3 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsShiftRight.cpp @@ -141,7 +141,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right(exespace(), KE::begin(view), KE::end(view), - shift_value); + shift_value); verify_data(rit, view, view_h, shift_value); } @@ -152,7 +152,7 @@ void run_single_scenario(const InfoType& scenario_info, // create host copy BEFORE shift_right or view will be modified auto view_h = create_host_space_copy(view); auto rit = KE::shift_right("label", exespace(), KE::begin(view), - KE::end(view), shift_value); + KE::end(view), shift_value); verify_data(rit, view, view_h, shift_value); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp index c388cadc9b..5a2c046939 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamAdjacentDifference.cpp @@ -62,8 +62,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest)); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -73,8 +73,8 @@ struct TestFunctorA { case 1: { auto it = KE::adjacent_difference(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_binaryOp); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_binaryOp); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp index e24ac37bf0..071ecd5a9a 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp index 7c3c465dc8..3f83ac7404 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopyIf.cpp @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate); + KE::begin(rowDest), predicate); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp index 7cbc788f8e..9b509af55b 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCopy_n.cpp @@ -53,7 +53,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::copy_n(member, KE::begin(myRowViewFrom), m_copyCount, - KE::begin(myRowViewDest)); + KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp index 922424afbd..38df5c30ce 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamCount.cpp @@ -111,7 +111,7 @@ void test_A(const bool searched_value_exist, std::size_t numTeams, using rand_pool = Kokkos::Random_XorShift64_Pool; - rand_pool pool(lowerBound * upperBound); + rand_pool pool(static_cast(lowerBound) * upperBound); if (searched_value_exist) { Kokkos::View randomIndices( diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp index 7cb9851087..0c35c5e599 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamExclusiveScan.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::exclusive_scan(member, KE::cbegin(rowViewSrc), - KE::cend(rowViewSrc), - KE::begin(rowViewDest), initVal); + KE::cend(rowViewSrc), + KE::begin(rowViewDest), initVal); resultDist = KE::distance(KE::begin(rowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp index 430e4917e0..88c5e21f31 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFind.cpp @@ -51,7 +51,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), searchedValue); + KE::cend(myRowViewFrom), searchedValue); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp index 83eca33569..d350bc62cd 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindEnd.cpp @@ -86,9 +86,9 @@ struct TestFunctorA { case 2: { auto it = KE::find_end(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::cbegin(myRowSearchedSeqView), - KE::cend(myRowSearchedSeqView), m_binaryPred); + KE::cend(myRowViewFrom), + KE::cbegin(myRowSearchedSeqView), + KE::cend(myRowSearchedSeqView), m_binaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -99,7 +99,7 @@ struct TestFunctorA { case 3: { auto it = KE::find_end(member, myRowViewFrom, myRowSearchedSeqView, - m_binaryPred); + m_binaryPred); resultDist = KE::distance(KE::begin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp index ee4bbed7a3..70f2be77f6 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIf.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp index b9448c1a3e..873e8faf4c 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamFindIfNot.cpp @@ -70,7 +70,7 @@ struct TestFunctorA { switch (m_apiPick) { case 0: { auto it = KE::find_if_not(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), unaryPred); + KE::cend(myRowViewFrom), unaryPred); resultDist = KE::distance(KE::cbegin(myRowViewFrom), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp index 4b66dd9131..265cdf4746 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamGenerate_n.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::generate_n(member, myRowView, m_count, - GenerateFunctor()); + GenerateFunctor()); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp index 850e80dde1..f76a595b3f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSorted.cpp @@ -62,7 +62,7 @@ struct TestFunctorA { } else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; result = KE::is_sorted(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_returnsView(myRowIndex) = result; }); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp index e3b95527c7..5bc49e4600 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamIsSortedUntil.cpp @@ -61,7 +61,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView)); + KE::cend(myRowView)); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -77,8 +77,8 @@ struct TestFunctorA { else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, KE::cbegin(myRowView), - KE::cend(myRowView), - CustomLessThanComparator{}); + KE::cend(myRowView), + CustomLessThanComparator{}); resultDist = KE::distance(KE::cbegin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -88,7 +88,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::is_sorted_until(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -210,7 +210,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId, stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::is_sorted_until(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp index 283525dbd1..452a48df21 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMaxElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::max_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::max_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp index 8579b48315..2c79370b92 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinElement.cpp @@ -74,7 +74,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto it = KE::min_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -144,7 +144,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::cbegin(myRow), it); } else { auto it = std::min_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance = KE::distance(KE::cbegin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp index 51010fdff5..25a4487855 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMinMaxElement.cpp @@ -84,7 +84,7 @@ struct TestFunctorA { else if (m_apiPick == 3) { using value_type = typename ViewType::value_type; auto itPair = KE::minmax_element(member, myRowView, - CustomLessThanComparator{}); + CustomLessThanComparator{}); resultDist1 = KE::distance(KE::begin(myRowView), itPair.first); resultDist2 = KE::distance(KE::begin(myRowView), itPair.second); @@ -160,7 +160,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } else { auto itPair = std::minmax_element(KE::cbegin(myRow), KE::cend(myRow), - CustomLessThanComparator{}); + CustomLessThanComparator{}); stdDistance[0] = KE::distance(KE::cbegin(myRow), itPair.first); stdDistance[1] = KE::distance(KE::cbegin(myRow), itPair.second); } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp index 1122d6d554..2c445dacf8 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamMove.cpp @@ -50,7 +50,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::move(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), KE::begin(myRowViewDest)); + KE::end(myRowViewFrom), KE::begin(myRowViewDest)); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp index fb9c70391b..2defa1dc6f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemove.cpp @@ -63,7 +63,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove(member, KE::begin(myRowView), KE::end(myRowView), - m_targetValue); + m_targetValue); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp index 6bb0d24998..71a50e39e3 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopy.cpp @@ -67,8 +67,8 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::remove_copy(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), m_targetValue); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), m_targetValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp index cff9aa178a..d5b5304f63 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRemoveCopyIf.cpp @@ -65,8 +65,8 @@ struct TestFunctorA { GreaterThanValueFunctor predicate(m_threshold); if (m_apiPick == 0) { auto it = KE::remove_copy_if(member, KE::cbegin(myRowViewFrom), - KE::cend(myRowViewFrom), - KE::begin(myRowViewDest), predicate); + KE::cend(myRowViewFrom), + KE::begin(myRowViewDest), predicate); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp index 70dbf10574..64f172e401 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopy.cpp @@ -78,7 +78,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy(member, myRowViewFrom, myRowViewDest, - m_targetValue, m_newValue); + m_targetValue, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -172,7 +172,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { auto rowFrom = Kokkos::subview(sourceView_dc_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), targetVal, newVal); + KE::begin(rowDest), targetVal, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp index d0217aed7a..9c3699320d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamReplaceCopyIf.cpp @@ -76,7 +76,7 @@ struct TestFunctorA { }); } else if (m_apiPick == 1) { auto it = KE::replace_copy_if(member, myRowViewFrom, myRowViewDest, - predicate, m_newValue); + predicate, m_newValue); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -151,7 +151,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { Kokkos::subview(cloneOfSourceViewBeforeOp_h, i, Kokkos::ALL()); auto rowDest = Kokkos::subview(stdDestView, i, Kokkos::ALL()); auto it = std::replace_copy_if(KE::cbegin(rowFrom), KE::cend(rowFrom), - KE::begin(rowDest), predicate, newVal); + KE::begin(rowDest), predicate, newVal); const std::size_t stdDistance = KE::distance(KE::begin(rowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp index e865b998f6..51f600faba 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamRotateCopy.cpp @@ -136,7 +136,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, std::size_t pivotShift, auto pivot = KE::cbegin(myRowFrom) + pivotShift; auto it = std::rotate_copy(KE::cbegin(myRowFrom), pivot, - KE::cend(myRowFrom), KE::begin(myRowDest)); + KE::cend(myRowFrom), KE::begin(myRowDest)); const std::size_t stdDistance = KE::distance(KE::begin(myRowDest), it); ASSERT_EQ(stdDistance, distancesView_h(i)); ASSERT_TRUE(intraTeamSentinelView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp index 00a80c5ef0..08ff8fbbca 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamShiftRight.cpp @@ -47,7 +47,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::shift_right(member, KE::begin(myRowView), - KE::end(myRowView), m_shift); + KE::end(myRowView), m_shift); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp index 5fc9612caa..60cb3f0837 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamSwapRanges.cpp @@ -49,7 +49,7 @@ struct TestFunctorA { if (m_apiPick == 0) { auto it = KE::swap_ranges(member, KE::begin(myRowView1), - KE::end(myRowView1), KE::begin(myRowView2)); + KE::end(myRowView1), KE::begin(myRowView2)); resultDist = KE::distance(KE::begin(myRowView2), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp index 0b0d798fd8..78a21c4430 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamTransformInclusiveScan.cpp @@ -91,7 +91,7 @@ struct TestFunctorA { case 1: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp); + m_binaryOp, m_unaryOp); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); @@ -111,7 +111,7 @@ struct TestFunctorA { case 3: { auto it = KE::transform_inclusive_scan(member, srcRow, destRow, - m_binaryOp, m_unaryOp, initVal); + m_binaryOp, m_unaryOp, initVal); resultDist = KE::distance(firstDest, it); Kokkos::single(Kokkos::PerTeam(member), [=, *this] { m_distancesView(rowIndex) = resultDist; }); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp index c46146e0a8..cef0f7c13d 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUnique.cpp @@ -58,7 +58,7 @@ struct TestFunctorA { } else if (m_apiPick == 2) { using value_type = typename ViewType::value_type; auto it = KE::unique(member, KE::begin(myRowView), KE::end(myRowView), - CustomEqualityComparator{}); + CustomEqualityComparator{}); resultDist = KE::distance(KE::begin(myRowView), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -138,7 +138,7 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { stdDistance = KE::distance(KE::begin(myRow), it); } else { auto it = std::unique(KE::begin(myRow), KE::end(myRow), - CustomEqualityComparator{}); + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRow), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp index 0d3289e196..89ea8154c7 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTeamUniqueCopy.cpp @@ -72,8 +72,8 @@ struct TestFunctorA { using comparator_t = CustomEqualityComparator; auto it = KE::unique_copy(member, KE::begin(myRowViewFrom), - KE::end(myRowViewFrom), - KE::begin(myRowViewDest), comparator_t()); + KE::end(myRowViewFrom), + KE::begin(myRowViewDest), comparator_t()); resultDist = KE::distance(KE::begin(myRowViewDest), it); Kokkos::single(Kokkos::PerTeam(member), [=, *this]() { m_distancesView(myRowIndex) = resultDist; @@ -159,12 +159,12 @@ void test_A(std::size_t numTeams, std::size_t numCols, int apiId) { std::size_t stdDistance = 0; if (apiId <= 1) { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest)); + KE::begin(myRowDest)); stdDistance = KE::distance(KE::begin(myRowDest), it); } else { auto it = std::unique_copy(KE::cbegin(myRowFrom), KE::cend(myRowFrom), - KE::begin(myRowDest), - CustomEqualityComparator{}); + KE::begin(myRowDest), + CustomEqualityComparator{}); stdDistance = KE::distance(KE::begin(myRowDest), it); } ASSERT_EQ(stdDistance, distancesView_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp index fa2804256a..365ca21688 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformExclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -161,7 +161,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp index fb81ae91b0..cc87262147 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsTransformInclusiveScan.cpp @@ -115,7 +115,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); @@ -173,7 +173,7 @@ void verify_data(ViewType1 data_view, // contains data create_mirror_view_and_copy(Kokkos::HostSpace(), test_view_dc); if (test_view_h.extent(0) > 0) { for (std::size_t i = 0; i < test_view_h.extent(0); ++i) { - if (std::is_same::value) { + if (std::is_same_v) { ASSERT_EQ(gold_h(i), test_view_h(i)); } else { const auto error = std::abs(gold_h(i) - test_view_h(i)); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp index 9c5ae0cf8a..6ee93e3d5f 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUnique.cpp @@ -138,7 +138,7 @@ void fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } Kokkos::deep_copy(aux_view, v_h); diff --git a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp index 3cf43ad4db..e3e9696458 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdAlgorithmsUniqueCopy.cpp @@ -146,7 +146,7 @@ std::size_t fill_view(ViewType dest_view, const std::string& name) { } else { - throw std::runtime_error("invalid choice"); + Kokkos::abort("invalid choice"); } Kokkos::deep_copy(aux_view, v_h); @@ -235,7 +235,7 @@ void verify_data(const std::string& name, ViewTypeFrom view_from, } else { - throw std::runtime_error("invalid choice"); + FAIL() << "invalid choice"; } } diff --git a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp index c05006a161..0044b93558 100644 --- a/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp +++ b/lib/kokkos/algorithms/unit_tests/TestStdReducers.cpp @@ -72,7 +72,7 @@ auto create_host_view_with_reduction_order_indices( result(8) = 7; result(9) = 5; } else { - throw std::runtime_error("test: Invalid enum"); + Kokkos::abort("test: Invalid enum"); } return result; @@ -80,7 +80,7 @@ auto create_host_view_with_reduction_order_indices( template auto run_min_or_max_test(ViewType view, StdReducersTestEnumOrder enValue) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; @@ -191,7 +191,7 @@ template void run_min_max_test(ViewType view, StdReducersTestEnumOrder enValue, const ValuesPair gold_values, const IndexPair gold_locs) { - static_assert(std::is_same::value, + static_assert(std::is_same_v, "test is only enabled for HostSpace"); using view_value_type = typename ViewType::value_type; diff --git a/lib/kokkos/appveyor.yml b/lib/kokkos/appveyor.yml deleted file mode 100644 index d0a5645ef7..0000000000 --- a/lib/kokkos/appveyor.yml +++ /dev/null @@ -1,10 +0,0 @@ -image: - - Visual Studio 2019 -clone_folder: c:\projects\source -build_script: -- cmd: >- - mkdir build && - cd build && - cmake c:\projects\source -DKokkos_ENABLE_IMPL_MDSPAN=OFF -DKokkos_ENABLE_TESTS=ON -DCMAKE_CXX_FLAGS="/W0 /EHsc" -DKokkos_ENABLE_DEPRECATED_CODE_4=ON -DKokkos_ENABLE_DEPRECATION_WARNINGS=OFF && - cmake --build . --target install && - ctest -C Debug --output-on-failure diff --git a/lib/kokkos/benchmarks/CMakeLists.txt b/lib/kokkos/benchmarks/CMakeLists.txt index 529ef393d9..968c8ae3bf 100644 --- a/lib/kokkos/benchmarks/CMakeLists.txt +++ b/lib/kokkos/benchmarks/CMakeLists.txt @@ -1,12 +1,12 @@ #FIXME_OPENMPTARGET - compiling in debug mode causes ICE. -KOKKOS_ADD_BENCHMARK_DIRECTORIES(atomic) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gather) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(gups) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(launch_latency) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(stream) -KOKKOS_ADD_BENCHMARK_DIRECTORIES(view_copy_constructor) +kokkos_add_benchmark_directories(atomic) +kokkos_add_benchmark_directories(gather) +kokkos_add_benchmark_directories(gups) +kokkos_add_benchmark_directories(launch_latency) +kokkos_add_benchmark_directories(stream) +kokkos_add_benchmark_directories(view_copy_constructor) #FIXME_OPENMPTARGET - These two benchmarks cause ICE. Commenting them for now but a deeper analysis on the cause and a possible fix will follow. -IF(NOT Kokkos_ENABLE_OPENMPTARGET) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(policy_performance) - KOKKOS_ADD_BENCHMARK_DIRECTORIES(bytes_and_flops) -ENDIF() +if(NOT Kokkos_ENABLE_OPENMPTARGET) + kokkos_add_benchmark_directories(policy_performance) + kokkos_add_benchmark_directories(bytes_and_flops) +endif() diff --git a/lib/kokkos/benchmarks/atomic/CMakeLists.txt b/lib/kokkos/benchmarks/atomic/CMakeLists.txt index 85f7412f49..7fda2bf6f6 100644 --- a/lib/kokkos/benchmarks/atomic/CMakeLists.txt +++ b/lib/kokkos/benchmarks/atomic/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - atomic - SOURCES main.cpp -) +kokkos_add_executable(atomic SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt index 0ce44a6f1a..9c65d06ce2 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt +++ b/lib/kokkos/benchmarks/bytes_and_flops/CMakeLists.txt @@ -1,4 +1,9 @@ -KOKKOS_ADD_EXECUTABLE( +kokkos_add_executable( bytes_and_flops - SOURCES bench_double.cpp bench_float.cpp bench_int32_t.cpp bench_int64_t.cpp main.cpp + SOURCES + bench_double.cpp + bench_float.cpp + bench_int32_t.cpp + bench_int64_t.cpp + main.cpp ) diff --git a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp index 78cfd48eff..762cc988f1 100644 --- a/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp +++ b/lib/kokkos/benchmarks/bytes_and_flops/bench_unroll_stride.hpp @@ -17,9 +17,9 @@ template struct Run { static void run(int N, int K, int R, int F, int T, int S, int Ba, int I) { - Kokkos::View A("A", N, K); - Kokkos::View B("B", N, K); - Kokkos::View C("C", N, K); + Kokkos::View A("A", N, K); + Kokkos::View B("B", N, K); + Kokkos::View C("C", N, K); Kokkos::deep_copy(A, Scalar(1.5)); Kokkos::deep_copy(B, Scalar(2.5)); diff --git a/lib/kokkos/benchmarks/gather/CMakeLists.txt b/lib/kokkos/benchmarks/gather/CMakeLists.txt index 24c7062772..2de1ce85e6 100644 --- a/lib/kokkos/benchmarks/gather/CMakeLists.txt +++ b/lib/kokkos/benchmarks/gather/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gather - SOURCES main.cpp -) +kokkos_add_executable(gather SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/gups/CMakeLists.txt b/lib/kokkos/benchmarks/gups/CMakeLists.txt index 8de5b73cc6..dc70747029 100644 --- a/lib/kokkos/benchmarks/gups/CMakeLists.txt +++ b/lib/kokkos/benchmarks/gups/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - gups - SOURCES gups.cpp -) +kokkos_add_executable(gups SOURCES gups.cpp) diff --git a/lib/kokkos/benchmarks/gups/gups.cpp b/lib/kokkos/benchmarks/gups/gups.cpp index 369052321d..e00f87968b 100644 --- a/lib/kokkos/benchmarks/gups/gups.cpp +++ b/lib/kokkos/benchmarks/gups/gups.cpp @@ -140,7 +140,7 @@ int run_benchmark(const Index indicesCount, const Index dataCount, break; } default: { - throw std::runtime_error("unexpected mode"); + Kokkos::abort("unexpected mode"); } } diff --git a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt index bb14da749d..4775bf2261 100644 --- a/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt +++ b/lib/kokkos/benchmarks/launch_latency/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - launch_latency - SOURCES launch_latency.cpp -) +kokkos_add_executable(launch_latency SOURCES launch_latency.cpp) diff --git a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp index 73b176ab8d..156c29af09 100644 --- a/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp +++ b/lib/kokkos/benchmarks/launch_latency/launch_latency.cpp @@ -254,7 +254,7 @@ int main(int argc, char* argv[]) { else if (i == 3) K = atoi(arg.data()); else { - throw std::runtime_error("unexpected argument!"); + Kokkos::abort("unexpected argument!"); } } else if (arg == "--no-parallel-for") { opts.par_for = false; @@ -265,7 +265,7 @@ int main(int argc, char* argv[]) { } else { std::stringstream ss; ss << "unexpected argument \"" << arg << "\" at position " << i; - throw std::runtime_error(ss.str()); + Kokkos::abort(ss.str().c_str()); } } diff --git a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt index 929b9c9702..4a939775c0 100644 --- a/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt +++ b/lib/kokkos/benchmarks/policy_performance/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - policy_performance - SOURCES main.cpp -) +kokkos_add_executable(policy_performance SOURCES main.cpp) diff --git a/lib/kokkos/benchmarks/stream/CMakeLists.txt b/lib/kokkos/benchmarks/stream/CMakeLists.txt index 0dded6e3a5..b096976c48 100644 --- a/lib/kokkos/benchmarks/stream/CMakeLists.txt +++ b/lib/kokkos/benchmarks/stream/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - stream - SOURCES stream-kokkos.cpp -) +kokkos_add_executable(stream SOURCES stream-kokkos.cpp) diff --git a/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt index 50a331b2b3..f7bbc13b6e 100644 --- a/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt +++ b/lib/kokkos/benchmarks/view_copy_constructor/CMakeLists.txt @@ -1,4 +1 @@ -KOKKOS_ADD_EXECUTABLE( - view_copy_constructor - SOURCES view_copy_constructor.cpp -) +kokkos_add_executable(view_copy_constructor SOURCES view_copy_constructor.cpp) diff --git a/lib/kokkos/bin/kokkos_launch_compiler b/lib/kokkos/bin/kokkos_launch_compiler index d1f8896f91..ee3c29e96d 100755 --- a/lib/kokkos/bin/kokkos_launch_compiler +++ b/lib/kokkos/bin/kokkos_launch_compiler @@ -62,7 +62,7 @@ KOKKOS_COMPILER=${1} shift # store the expected C++ compiler -CXX_COMPILER=${1} +CXX_COMPILER=$(which "${1}") # remove the expected C++ compiler from the arguments shift @@ -84,7 +84,7 @@ shift # kokkos_launch_compiler ${KOKKOS_COMPILER} g++ g++ -c file.cpp -o file.o # results in this command being executed: # ${KOKKOS_COMPILER} -c file.cpp -o file.o -if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != "${1}" ]]; then +if [[ "${KOKKOS_DEPENDENCE}" -eq "0" || "${CXX_COMPILER}" != $(which "${1}") ]]; then debug-message "$@" # the command does not depend on Kokkos so just execute the command w/o re-directing to ${KOKKOS_COMPILER} exec "$@" diff --git a/lib/kokkos/cmake/Dependencies.cmake b/lib/kokkos/cmake/Dependencies.cmake index fb1e73b579..2f70c2f038 100644 --- a/lib/kokkos/cmake/Dependencies.cmake +++ b/lib/kokkos/cmake/Dependencies.cmake @@ -1,5 +1,3 @@ -TRIBITS_PACKAGE_DEFINE_DEPENDENCIES( - LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib - ) +tribits_package_define_dependencies(LIB_OPTIONAL_TPLS Pthread CUDA HWLOC DLlib) -TRIBITS_TPL_TENTATIVELY_ENABLE(DLlib) +tribits_tpl_tentatively_enable(DLlib) diff --git a/lib/kokkos/cmake/KokkosCore_config.h.in b/lib/kokkos/cmake/KokkosCore_config.h.in index 08f128f2d1..44f81bb8ce 100644 --- a/lib/kokkos/cmake/KokkosCore_config.h.in +++ b/lib/kokkos/cmake/KokkosCore_config.h.in @@ -24,7 +24,6 @@ #cmakedefine KOKKOS_ENABLE_HIP #cmakedefine KOKKOS_ENABLE_HPX #cmakedefine KOKKOS_ENABLE_SYCL -#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED /* General Settings */ #cmakedefine KOKKOS_ENABLE_CXX17 @@ -40,7 +39,10 @@ #cmakedefine KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY #cmakedefine KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE #cmakedefine KOKKOS_ENABLE_HIP_MULTIPLE_KERNEL_INSTANTIATIONS -#cmakedefine KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#cmakedefine KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC +#cmakedefine KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE +#cmakedefine KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED +#cmakedefine KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE #cmakedefine KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH #cmakedefine KOKKOS_ENABLE_DEBUG #cmakedefine KOKKOS_ENABLE_DEBUG_DUALVIEW_MODIFY_CHECK @@ -80,6 +82,7 @@ #cmakedefine KOKKOS_ARCH_POWER8 #cmakedefine KOKKOS_ARCH_POWER9 #cmakedefine KOKKOS_ARCH_RISCV_SG2042 +#cmakedefine KOKKOS_ARCH_RISCV_RVA22V #cmakedefine KOKKOS_ARCH_INTEL_GEN #cmakedefine KOKKOS_ARCH_INTEL_DG1 #cmakedefine KOKKOS_ARCH_INTEL_GEN9 @@ -118,10 +121,11 @@ #cmakedefine KOKKOS_ARCH_AMD_GFX90A #cmakedefine KOKKOS_ARCH_AMD_GFX940 #cmakedefine KOKKOS_ARCH_AMD_GFX942 +#cmakedefine KOKKOS_ARCH_AMD_GFX942_APU #cmakedefine KOKKOS_ARCH_AMD_GFX1030 #cmakedefine KOKKOS_ARCH_AMD_GFX1100 #cmakedefine KOKKOS_ARCH_AMD_GFX1103 -#cmakedefine KOKKOS_ARCH_AMD_GPU +#cmakedefine KOKKOS_ARCH_AMD_GPU "@KOKKOS_ARCH_AMD_GPU@" #cmakedefine KOKKOS_ARCH_VEGA // deprecated #cmakedefine KOKKOS_ARCH_VEGA906 // deprecated #cmakedefine KOKKOS_ARCH_VEGA908 // deprecated diff --git a/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in b/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in deleted file mode 100644 index 626ef5a8eb..0000000000 --- a/lib/kokkos/cmake/KokkosTrilinosConfig.cmake.in +++ /dev/null @@ -1,17 +0,0 @@ -IF (NOT TARGET Kokkos::kokkos) - # Compute the installation prefix relative to this file. - get_filename_component(KOKKOS_IMPORT_PREFIX "${CMAKE_CURRENT_LIST_FILE}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - get_filename_component(KOKKOS_IMPORT_PREFIX "${KOKKOS_IMPORT_PREFIX}" PATH) - if(KOKKOS_IMPORT_PREFIX STREQUAL "/") - set(KOKKOS_IMPORT_PREFIX "") - endif() - add_library(Kokkos::kokkos INTERFACE IMPORTED) - set_target_properties(Kokkos::kokkos PROPERTIES - INTERFACE_LINK_LIBRARIES "@Kokkos_LIBRARIES@;@KOKKOS_LINK_OPTIONS@" - INTERFACE_COMPILE_FEATURES "@KOKKOS_CXX_STANDARD_FEATURE@" - INTERFACE_COMPILE_OPTIONS "@KOKKOS_ALL_COMPILE_OPTIONS@" - INTERFACE_INCLUDE_DIRECTORIES "${KOKKOS_IMPORT_PREFIX}/include" - ) -ENDIF() diff --git a/lib/kokkos/cmake/Modules/CudaToolkit.cmake b/lib/kokkos/cmake/Modules/CudaToolkit.cmake index eda5541f7c..b8ac2048b5 100644 --- a/lib/kokkos/cmake/Modules/CudaToolkit.cmake +++ b/lib/kokkos/cmake/Modules/CudaToolkit.cmake @@ -483,38 +483,40 @@ endif() # Try language- or user-provided path first. if(CUDAToolkit_BIN_DIR) - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${CUDAToolkit_BIN_DIR} NO_DEFAULT_PATH - ) + ) endif() # Search using CUDAToolkit_ROOT -find_program(CUDAToolkit_NVCC_EXECUTABLE +find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ENV CUDA_PATH PATH_SUFFIXES bin ) # If the user specified CUDAToolkit_ROOT but nvcc could not be found, this is an error. -if (NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) +if(NOT CUDAToolkit_NVCC_EXECUTABLE AND (DEFINED CUDAToolkit_ROOT OR DEFINED ENV{CUDAToolkit_ROOT})) # Declare error messages now, print later depending on find_package args. set(fail_base "Could not find nvcc executable in path specified by") set(cuda_root_fail "${fail_base} CUDAToolkit_ROOT=${CUDAToolkit_ROOT}") set(env_cuda_root_fail "${fail_base} environment variable CUDAToolkit_ROOT=$ENV{CUDAToolkit_ROOT}") - if (CUDAToolkit_FIND_REQUIRED) - if (DEFINED CUDAToolkit_ROOT) + if(CUDAToolkit_FIND_REQUIRED) + if(DEFINED CUDAToolkit_ROOT) message(FATAL_ERROR ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(FATAL_ERROR ${env_cuda_root_fail}) endif() else() - if (NOT CUDAToolkit_FIND_QUIETLY) - if (DEFINED CUDAToolkit_ROOT) + if(NOT CUDAToolkit_FIND_QUIETLY) + if(DEFINED CUDAToolkit_ROOT) message(STATUS ${cuda_root_fail}) - elseif (DEFINED ENV{CUDAToolkit_ROOT}) + elseif(DEFINED ENV{CUDAToolkit_ROOT}) message(STATUS ${env_cuda_root_fail}) endif() endif() @@ -535,9 +537,9 @@ endif() # We will also search the default symlink location /usr/local/cuda first since # if CUDAToolkit_ROOT is not specified, it is assumed that the symlinked # directory is the desired location. -if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (UNIX) - if (NOT APPLE) +if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(UNIX) + if(NOT APPLE) set(platform_base "/usr/local/cuda-") else() set(platform_base "/Developer/NVIDIA/CUDA-") @@ -550,10 +552,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) file(GLOB possible_paths "${platform_base}*") # Iterate the glob results and create a descending list. set(possible_versions) - foreach (p ${possible_paths}) + foreach(p ${possible_paths}) # Extract version number from end of string string(REGEX MATCH "[0-9][0-9]?\\.[0-9]$" p_version ${p}) - if (IS_DIRECTORY ${p} AND p_version) + if(IS_DIRECTORY ${p} AND p_version) list(APPEND possible_versions ${p_version}) endif() endforeach() @@ -563,10 +565,10 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # every possible version of CUDA installed, this wouldn't create any # significant overhead. set(versions) - foreach (v ${possible_versions}) + foreach(v ${possible_versions}) list(LENGTH versions num_versions) # First version, nothing to compare with so just append. - if (num_versions EQUAL 0) + if(num_versions EQUAL 0) list(APPEND versions ${v}) else() # Loop through list. Insert at an index when comparison is @@ -574,9 +576,9 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # happen since this came from a glob list of directories. set(i 0) set(early_terminate FALSE) - while (i LESS num_versions) + while(i LESS num_versions) list(GET versions ${i} curr) - if (v VERSION_GREATER curr) + if(v VERSION_GREATER curr) list(INSERT versions ${i} ${v}) set(early_terminate TRUE) break() @@ -584,7 +586,7 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) math(EXPR i "${i} + 1") endwhile() # If it did not get inserted, place it at the end. - if (NOT early_terminate) + if(NOT early_terminate) list(APPEND versions ${v}) endif() endif() @@ -592,17 +594,18 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) # With a descending list of versions, populate possible paths to search. set(search_paths) - foreach (v ${versions}) + foreach(v ${versions}) list(APPEND search_paths "${platform_base}${v}") endforeach() # Force the global default /usr/local/cuda to the front on Unix. - if (UNIX) + if(UNIX) list(INSERT search_paths 0 "/usr/local/cuda") endif() # Now search for nvcc again using the platform default search paths. - find_program(CUDAToolkit_NVCC_EXECUTABLE + find_program( + CUDAToolkit_NVCC_EXECUTABLE NAMES nvcc nvcc.exe PATHS ${search_paths} PATH_SUFFIXES bin @@ -617,8 +620,8 @@ if (NOT CUDAToolkit_NVCC_EXECUTABLE) unset(early_terminate) unset(search_paths) - if (NOT CUDAToolkit_NVCC_EXECUTABLE) - if (CUDAToolkit_FIND_REQUIRED) + if(NOT CUDAToolkit_NVCC_EXECUTABLE) + if(CUDAToolkit_FIND_REQUIRED) message(FATAL_ERROR "Could not find nvcc, please set CUDAToolkit_ROOT.") elseif(NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Could not find nvcc, please set CUDAToolkit_ROOT.") @@ -636,8 +639,7 @@ if(NOT CUDAToolkit_BIN_DIR AND CUDAToolkit_NVCC_EXECUTABLE) unset(cuda_dir) endif() -if(CUDAToolkit_NVCC_EXECUTABLE AND - CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) +if(CUDAToolkit_NVCC_EXECUTABLE AND CUDAToolkit_NVCC_EXECUTABLE STREQUAL CMAKE_CUDA_COMPILER) # Need to set these based off the already computed CMAKE_CUDA_COMPILER_VERSION value # This if statement will always match, but is used to provide variables for MATCH 1,2,3... if(CMAKE_CUDA_COMPILER_VERSION MATCHES [=[([0-9]+)\.([0-9]+)\.([0-9]+)]=]) @@ -648,39 +650,38 @@ if(CUDAToolkit_NVCC_EXECUTABLE AND endif() else() # Compute the version by invoking nvcc - execute_process (COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) + execute_process(COMMAND ${CUDAToolkit_NVCC_EXECUTABLE} "--version" OUTPUT_VARIABLE NVCC_OUT) if(NVCC_OUT MATCHES [=[ V([0-9]+)\.([0-9]+)\.([0-9]+)]=]) set(CUDAToolkit_VERSION_MAJOR "${CMAKE_MATCH_1}") set(CUDAToolkit_VERSION_MINOR "${CMAKE_MATCH_2}") set(CUDAToolkit_VERSION_PATCH "${CMAKE_MATCH_3}") - set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") + set(CUDAToolkit_VERSION "${CMAKE_MATCH_1}.${CMAKE_MATCH_2}.${CMAKE_MATCH_3}") endif() unset(NVCC_OUT) endif() - get_filename_component(CUDAToolkit_ROOT_DIR ${CUDAToolkit_BIN_DIR} DIRECTORY ABSOLUTE) # Handle cross compilation if(CMAKE_CROSSCOMPILING) if(CMAKE_SYSTEM_PROCESSOR STREQUAL "armv7-a") # Support for NVPACK - set (CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") + set(CUDAToolkit_TARGET_NAME "armv7-linux-androideabi") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "arm") # Support for arm cross compilation set(CUDAToolkit_TARGET_NAME "armv7-linux-gnueabihf") elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") # Support for aarch64 cross compilation - if (ANDROID_ARCH_NAME STREQUAL "arm64") + if(ANDROID_ARCH_NAME STREQUAL "arm64") set(CUDAToolkit_TARGET_NAME "aarch64-linux-androideabi") else() set(CUDAToolkit_TARGET_NAME "aarch64-linux") - endif (ANDROID_ARCH_NAME STREQUAL "arm64") + endif(ANDROID_ARCH_NAME STREQUAL "arm64") elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64") - set(CUDAToolkit_TARGET_NAME "x86_64-linux") + set(CUDAToolkit_TARGET_NAME "x86_64-linux") endif() - if (EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") + if(EXISTS "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") set(CUDAToolkit_TARGET_DIR "${CUDAToolkit_ROOT_DIR}/targets/${CUDAToolkit_TARGET_NAME}") # add known CUDA target root path to the set of directories we search for programs, libraries and headers list(PREPEND CMAKE_FIND_ROOT_PATH "${CUDAToolkit_TARGET_DIR}") @@ -702,25 +703,16 @@ else() set(_CUDAToolkit_Pop_Prefix True) endif() - # Find the include/ directory -find_path(CUDAToolkit_INCLUDE_DIR - NAMES cuda_runtime.h -) +find_path(CUDAToolkit_INCLUDE_DIR NAMES cuda_runtime.h) # And find the CUDA Runtime Library libcudart -find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64 lib/x64 -) -if (NOT CUDA_CUDART) - find_library(CUDA_CUDART - NAMES cudart - PATH_SUFFIXES lib64/stubs lib/x64/stubs - ) +find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64 lib/x64) +if(NOT CUDA_CUDART) + find_library(CUDA_CUDART NAMES cudart PATH_SUFFIXES lib64/stubs lib/x64/stubs) endif() -if (NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) +if(NOT CUDA_CUDART AND NOT CUDAToolkit_FIND_QUIETLY) message(STATUS "Unable to find cudart library.") endif() @@ -733,24 +725,17 @@ endif() #----------------------------------------------------------------------------- # Perform version comparison and validate all required variables are set. include(FindPackageHandleStandardArgs) -find_package_handle_standard_args(CUDAToolkit - REQUIRED_VARS - CUDAToolkit_INCLUDE_DIR - CUDA_CUDART - CUDAToolkit_NVCC_EXECUTABLE - VERSION_VAR - CUDAToolkit_VERSION +find_package_handle_standard_args( + CUDAToolkit REQUIRED_VARS CUDAToolkit_INCLUDE_DIR CUDA_CUDART CUDAToolkit_NVCC_EXECUTABLE + VERSION_VAR CUDAToolkit_VERSION ) -mark_as_advanced(CUDA_CUDART - CUDAToolkit_INCLUDE_DIR - CUDAToolkit_NVCC_EXECUTABLE - ) +mark_as_advanced(CUDA_CUDART CUDAToolkit_INCLUDE_DIR CUDAToolkit_NVCC_EXECUTABLE) #----------------------------------------------------------------------------- # Construct result variables if(CUDAToolkit_FOUND) - set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) - get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) + set(CUDAToolkit_INCLUDE_DIRS ${CUDAToolkit_INCLUDE_DIR}) + get_filename_component(CUDAToolkit_LIBRARY_DIR ${CUDA_CUDART} DIRECTORY ABSOLUTE) endif() #----------------------------------------------------------------------------- @@ -762,27 +747,26 @@ if(CUDAToolkit_FOUND) set(search_names ${lib_name} ${arg_ALT}) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH - PATH_SUFFIXES nvidia/current lib64 lib/x64 lib - ${arg_EXTRA_PATH_SUFFIXES} + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH + PATH_SUFFIXES nvidia/current lib64 lib/x64 lib ${arg_EXTRA_PATH_SUFFIXES} ) # Don't try any stub directories intil we have exhausted all other # search locations. if(NOT CUDA_${lib_name}_LIBRARY) - find_library(CUDA_${lib_name}_LIBRARY + find_library( + CUDA_${lib_name}_LIBRARY NAMES ${search_names} - HINTS ${CUDAToolkit_LIBRARY_DIR} - ENV CUDA_PATH + HINTS ${CUDAToolkit_LIBRARY_DIR} ENV CUDA_PATH PATH_SUFFIXES lib64/stubs lib/x64/stubs lib/stubs stubs ) endif() mark_as_advanced(CUDA_${lib_name}_LIBRARY) - if (NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) + if(NOT TARGET CUDA::${lib_name} AND CUDA_${lib_name}_LIBRARY) add_library(CUDA::${lib_name} IMPORTED INTERFACE) target_include_directories(CUDA::${lib_name} SYSTEM INTERFACE "${CUDAToolkit_INCLUDE_DIRS}") target_link_libraries(CUDA::${lib_name} INTERFACE "${CUDA_${lib_name}_LIBRARY}") @@ -800,16 +784,15 @@ if(CUDAToolkit_FOUND) target_link_directories(CUDA::toolkit INTERFACE "${CUDAToolkit_LIBRARY_DIR}") endif() - _CUDAToolkit_find_and_add_import_lib(cuda_driver ALT cuda) + _cudatoolkit_find_and_add_import_lib(cuda_driver ALT cuda) - _CUDAToolkit_find_and_add_import_lib(cudart) - _CUDAToolkit_find_and_add_import_lib(cudart_static) + _cudatoolkit_find_and_add_import_lib(cudart) + _cudatoolkit_find_and_add_import_lib(cudart_static) # setup dependencies that are required for cudart_static when building # on linux. These are generally only required when using the CUDA toolkit # when CUDA language is disabled - if(NOT TARGET CUDA::cudart_static_deps - AND TARGET CUDA::cudart_static) + if(NOT TARGET CUDA::cudart_static_deps AND TARGET CUDA::cudart_static) add_library(CUDA::cudart_static_deps IMPORTED INTERFACE) target_link_libraries(CUDA::cudart_static INTERFACE CUDA::cudart_static_deps) @@ -831,55 +814,64 @@ if(CUDAToolkit_FOUND) endif() endif() - _CUDAToolkit_find_and_add_import_lib(culibos) # it's a static library - foreach (cuda_lib cublas cufft curand cusparse nppc nvjpeg) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) + _cudatoolkit_find_and_add_import_lib(culibos) # it's a static library + foreach(cuda_lib cublas cufft curand cusparse nppc nvjpeg) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS culibos) endforeach() # cuFFTW depends on cuFFT - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft) - _CUDAToolkit_find_and_add_import_lib(cufftw DEPS cufft_static) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft) + _cudatoolkit_find_and_add_import_lib(cufftw DEPS cufft_static) # cuSOLVER depends on cuBLAS, and cuSPARSE - _CUDAToolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) - _CUDAToolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) + _cudatoolkit_find_and_add_import_lib(cusolver DEPS cublas cusparse) + _cudatoolkit_find_and_add_import_lib(cusolver_static DEPS cublas_static cusparse_static culibos) # nvGRAPH depends on cuRAND, and cuSOLVER. - _CUDAToolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) - _CUDAToolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) + _cudatoolkit_find_and_add_import_lib(nvgraph DEPS curand cusolver) + _cudatoolkit_find_and_add_import_lib(nvgraph_static DEPS curand_static cusolver_static) # Process the majority of the NPP libraries. - foreach (cuda_lib nppial nppicc nppidei nppif nppig nppim nppist nppitc npps nppicom nppisu) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) - _CUDAToolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) + foreach( + cuda_lib + nppial + nppicc + nppidei + nppif + nppig + nppim + nppist + nppitc + npps + nppicom + nppisu + ) + _cudatoolkit_find_and_add_import_lib(${cuda_lib} DEPS nppc) + _cudatoolkit_find_and_add_import_lib(${cuda_lib}_static DEPS nppc_static) endforeach() - _CUDAToolkit_find_and_add_import_lib(cupti - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(cupti_static - EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ - ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) + _cudatoolkit_find_and_add_import_lib(cupti_static EXTRA_PATH_SUFFIXES ../extras/CUPTI/lib64/ ../extras/CUPTI/lib/) - _CUDAToolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) + _cudatoolkit_find_and_add_import_lib(nvrtc DEPS cuda_driver) - _CUDAToolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) + _cudatoolkit_find_and_add_import_lib(nvml ALT nvidia-ml nvml) if(WIN32) # nvtools can be installed outside the CUDA toolkit directory # so prefer the NVTOOLSEXT_PATH windows only environment variable # In addition on windows the most common name is nvToolsExt64_1 - find_library(CUDA_nvToolsExt_LIBRARY + find_library( + CUDA_nvToolsExt_LIBRARY NAMES nvToolsExt64_1 nvToolsExt64 nvToolsExt - PATHS ENV NVTOOLSEXT_PATH - ENV CUDA_PATH + PATHS ENV NVTOOLSEXT_PATH ENV CUDA_PATH PATH_SUFFIXES lib/x64 lib ) endif() - _CUDAToolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) + _cudatoolkit_find_and_add_import_lib(nvToolsExt ALT nvToolsExt64) - _CUDAToolkit_find_and_add_import_lib(OpenCL) + _cudatoolkit_find_and_add_import_lib(OpenCL) endif() if(_CUDAToolkit_Pop_ROOT_PATH) diff --git a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake index 445f4e93a5..3a6a826197 100644 --- a/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLCUDA.cmake @@ -1,44 +1,40 @@ -IF (NOT CUDAToolkit_ROOT) - IF (NOT CUDA_ROOT) - SET(CUDA_ROOT $ENV{CUDA_ROOT}) - ENDIF() - IF(CUDA_ROOT) - SET(CUDAToolkit_ROOT ${CUDA_ROOT}) - ENDIF() -ENDIF() +if(NOT CUDAToolkit_ROOT) + if(NOT CUDA_ROOT) + set(CUDA_ROOT $ENV{CUDA_ROOT}) + endif() + if(CUDA_ROOT) + set(CUDAToolkit_ROOT ${CUDA_ROOT}) + endif() +endif() -IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") - MESSAGE(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC AND CMAKE_VERSION VERSION_LESS "3.20.1") + message(FATAL_ERROR "Using NVHPC as host compiler requires at least CMake 3.20.1") +endif() -IF(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") +if(CMAKE_VERSION VERSION_GREATER_EQUAL "3.17.0") find_package(CUDAToolkit REQUIRED) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - KOKKOS_EXPORT_CMAKE_TPL(CUDAToolkit REQUIRED) -ELSE() + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + kokkos_export_cmake_tpl(CUDAToolkit REQUIRED) +else() include(${CMAKE_CURRENT_LIST_DIR}/CudaToolkit.cmake) - IF (TARGET CUDA::cudart) - SET(FOUND_CUDART TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cudart) - ELSE() - SET(FOUND_CUDART FALSE) - ENDIF() + if(TARGET CUDA::cudart) + set(FOUND_CUDART TRUE) + kokkos_export_imported_tpl(CUDA::cudart) + else() + set(FOUND_CUDART FALSE) + endif() - IF (TARGET CUDA::cuda_driver) - SET(FOUND_CUDA_DRIVER TRUE) - KOKKOS_EXPORT_IMPORTED_TPL(CUDA::cuda_driver) - ELSE() - SET(FOUND_CUDA_DRIVER FALSE) - ENDIF() + if(TARGET CUDA::cuda_driver) + set(FOUND_CUDA_DRIVER TRUE) + kokkos_export_imported_tpl(CUDA::cuda_driver) + else() + set(FOUND_CUDA_DRIVER FALSE) + endif() include(FindPackageHandleStandardArgs) - FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) - IF (FOUND_CUDA_DRIVER AND FOUND_CUDART) - KOKKOS_CREATE_IMPORTED_TPL(CUDA INTERFACE - LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart - ) - ENDIF() -ENDIF() + find_package_handle_standard_args(TPLCUDA ${DEFAULT_MSG} FOUND_CUDART FOUND_CUDA_DRIVER) + if(FOUND_CUDA_DRIVER AND FOUND_CUDART) + kokkos_create_imported_tpl(CUDA INTERFACE LINK_LIBRARIES CUDA::cuda_driver CUDA::cudart) + endif() +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLHPX.cmake b/lib/kokkos/cmake/Modules/FindTPLHPX.cmake index d7b54fb9c9..e3c199b7c5 100644 --- a/lib/kokkos/cmake/Modules/FindTPLHPX.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLHPX.cmake @@ -1,15 +1,10 @@ - -FIND_PACKAGE(HPX REQUIRED 1.8.0) +find_package(HPX REQUIRED 1.8.0) #as of right now, HPX doesn't export correctly #so let's convert it to an interface target -KOKKOS_CREATE_IMPORTED_TPL(HPX INTERFACE - LINK_LIBRARIES ${HPX_LIBRARIES} - INCLUDES ${HPX_INCLUDE_DIRS} -) +kokkos_create_imported_tpl(HPX INTERFACE LINK_LIBRARIES ${HPX_LIBRARIES} INCLUDES ${HPX_INCLUDE_DIRS}) #this is a bit funky since this is a CMake target #but HPX doesn't export itself correctly -KOKKOS_EXPORT_CMAKE_TPL(HPX) +kokkos_export_cmake_tpl(HPX) #I would prefer all of this gets replaced with #KOKKOS_IMPORT_CMAKE_TPL(HPX) - diff --git a/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake b/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake index cf763b7e5b..77ce8c71f7 100644 --- a/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLHWLOC.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(HWLOC HEADER hwloc.h LIBRARY hwloc) +kokkos_find_imported(HWLOC HEADER hwloc.h LIBRARY hwloc) diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake index 8adcdcdbb8..85ae0b8224 100644 --- a/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLLIBDL.cmake @@ -1 +1 @@ -KOKKOS_FIND_IMPORTED(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) +kokkos_find_imported(LIBDL HEADER dlfcn.h INTERFACE LIBRARIES ${CMAKE_DL_LIBS}) diff --git a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake index 70e0d6c454..ce428b0aee 100644 --- a/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLLIBQUADMATH.cmake @@ -2,17 +2,19 @@ # (which would not be contained in CMake's search paths anyway). # Hence, try if the compiler supports libquadmath natively first before doing # the standard package search. -SET(CMAKE_REQUIRED_LIBRARIES "quadmath") -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +set(CMAKE_REQUIRED_LIBRARIES "quadmath") +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main(void){ __float128 foo = ::sqrtq(123.456); return foo; }" - KOKKOS_QUADMATH_COMPILER_SUPPORT) -IF (KOKKOS_QUADMATH_COMPILER_SUPPORT) - KOKKOS_CREATE_IMPORTED_TPL(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) -ELSE() - KOKKOS_FIND_IMPORTED(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) -ENDIF() + KOKKOS_QUADMATH_COMPILER_SUPPORT +) +if(KOKKOS_QUADMATH_COMPILER_SUPPORT) + kokkos_create_imported_tpl(LIBQUADMATH INTERFACE LINK_LIBRARIES quadmath) +else() + kokkos_find_imported(LIBQUADMATH HEADER quadmath.h LIBRARY quadmath) +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake index 603510c315..68de942a69 100644 --- a/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLONEDPL.cmake @@ -1,9 +1,10 @@ -INCLUDE(CheckIncludeFileCXX) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) -CHECK_INCLUDE_FILE_CXX(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) +include(CheckIncludeFileCXX) +check_include_file_cxx(oneapi/dpl/execution KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER) +check_include_file_cxx(oneapi/dpl/algorithm KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) -INCLUDE(CheckCXXSourceCompiles) -CHECK_CXX_SOURCE_COMPILES(" +include(CheckCXXSourceCompiles) +check_cxx_source_compiles( + " #include int main() @@ -13,37 +14,40 @@ CHECK_CXX_SOURCE_COMPILES(" #endif return 0; }" - KOKKOS_NO_TBB_CONFLICT) + KOKKOS_NO_TBB_CONFLICT +) -IF (KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE +if(KOKKOS_COMPILER_HAS_ONEDPL_EXECUTION_HEADER AND KOKKOS_COMPILER_HAS_ONEDPL_ALGORITHM_HEADER) + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() -ELSE() - FIND_PACKAGE(oneDPL REQUIRED) + endif() +else() + find_package(oneDPL REQUIRED) - IF(KOKKOS_NO_TBB_CONFLICT) - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL - ) - ELSE() - KOKKOS_CREATE_IMPORTED_TPL( - ONEDPL INTERFACE - LINK_LIBRARIES oneDPL + if(KOKKOS_NO_TBB_CONFLICT) + kokkos_create_imported_tpl(ONEDPL INTERFACE LINK_LIBRARIES oneDPL) + else() + kokkos_create_imported_tpl( + ONEDPL + INTERFACE + LINK_LIBRARIES + oneDPL # https://stackoverflow.com/questions/67923287/how-to-resolve-no-member-named-task-in-namespace-tbb-error-when-using-oned/ - COMPILE_DEFINITIONS PSTL_USE_PARALLEL_POLICIES=0 _GLIBCXX_USE_TBB_PAR_BACKEND=0 + COMPILE_DEFINITIONS + PSTL_USE_PARALLEL_POLICIES=0 + _GLIBCXX_USE_TBB_PAR_BACKEND=0 ) - ENDIF() + endif() # Export oneDPL as a Kokkos dependency - KOKKOS_EXPORT_CMAKE_TPL(oneDPL) -ENDIF() + kokkos_export_cmake_tpl(oneDPL) +endif() diff --git a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake index f796737f5b..9673af0b9d 100644 --- a/lib/kokkos/cmake/Modules/FindTPLROCM.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLROCM.cmake @@ -1,7 +1,7 @@ include(FindPackageHandleStandardArgs) -FIND_LIBRARY(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) -FIND_LIBRARY(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(AMD_HIP_LIBRARY amdhip64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) +find_library(HSA_RUNTIME_LIBRARY hsa-runtime64 PATHS ENV ROCM_PATH PATH_SUFFIXES lib) # FIXME_HIP Starting with ROCm 5.5 it is not necessary to link againt clang_rt. # We keep the code as is for now because it is hard to find the version of ROCM @@ -16,18 +16,24 @@ execute_process( COMMAND ${CMAKE_CXX_COMPILER} -print-libgcc-file-name --rtlib=compiler-rt OUTPUT_VARIABLE CLANG_RT_LIBRARY OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE CLANG_RT_CHECK) + RESULT_VARIABLE CLANG_RT_CHECK +) -if( NOT "${CLANG_RT_CHECK}" STREQUAL "0" ) +if(NOT "${CLANG_RT_CHECK}" STREQUAL "0") # if the above failed, we delete CLANG_RT_LIBRARY to make the args check # below fail unset(CLANG_RT_LIBRARY) endif() - find_package_handle_standard_args(TPLROCM DEFAULT_MSG AMD_HIP_LIBRARY HSA_RUNTIME_LIBRARY CLANG_RT_LIBRARY) -kokkos_create_imported_tpl(ROCM INTERFACE - LINK_LIBRARIES ${HSA_RUNTIME_LIBRARY} ${AMD_HIP_LIBRARY} ${CLANG_RT_LIBRARY} - COMPILE_DEFINITIONS __HIP_ROCclr__ +kokkos_create_imported_tpl( + ROCM + INTERFACE + LINK_LIBRARIES + ${HSA_RUNTIME_LIBRARY} + ${AMD_HIP_LIBRARY} + ${CLANG_RT_LIBRARY} + COMPILE_DEFINITIONS + __HIP_ROCclr__ ) diff --git a/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake index dae7dc3c95..b4b905795d 100644 --- a/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLROCTHRUST.cmake @@ -6,10 +6,10 @@ # behavior of ROCm 5.7 and later for earlier version of ROCm we set # AMDGPU_TARGETS and GPU_TARGETS to empty and set the values in the cache. If # the values are not cached, FIND_PACKAGE(rocthrust) will overwrite them. -SET(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") -SET(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") -FIND_PACKAGE(rocthrust REQUIRED) -KOKKOS_CREATE_IMPORTED_TPL(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) +set(AMDGPU_TARGETS "" CACHE STRING "AMD GPU targets to compile for") +set(GPU_TARGETS "" CACHE STRING "GPU targets to compile for") +find_package(rocthrust REQUIRED) +kokkos_create_imported_tpl(ROCTHRUST INTERFACE LINK_LIBRARIES roc::rocthrust) # Export ROCTHRUST as a Kokkos dependency -KOKKOS_EXPORT_CMAKE_TPL(rocthrust) +kokkos_export_cmake_tpl(rocthrust) diff --git a/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake b/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake index ff0db5123f..280b8641da 100644 --- a/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake +++ b/lib/kokkos/cmake/Modules/FindTPLTHREADS.cmake @@ -1,15 +1,14 @@ -INCLUDE(FindPackageHandleStandardArgs) -FIND_PACKAGE(Threads) +include(FindPackageHandleStandardArgs) +find_package(Threads) -IF (TARGET Threads::Threads) - SET(FOUND_THREADS TRUE) -ELSE() - SET(FOUND_THREADS FALSE) -ENDIF() +if(TARGET Threads::Threads) + set(FOUND_THREADS TRUE) +else() + set(FOUND_THREADS FALSE) +endif() -FIND_PACKAGE_HANDLE_STANDARD_ARGS(TPLTHREADS DEFAULT_MSG FOUND_THREADS) +find_package_handle_standard_args(TPLTHREADS DEFAULT_MSG FOUND_THREADS) #Only create the TPL if we succeed -IF (FOUND_THREADS) - KOKKOS_CREATE_IMPORTED_TPL(THREADS INTERFACE LINK_OPTIONS - ${CMAKE_THREAD_LIBS_INIT}) -ENDIF() +if(FOUND_THREADS) + kokkos_create_imported_tpl(THREADS INTERFACE LINK_OPTIONS ${CMAKE_THREAD_LIBS_INIT}) +endif() diff --git a/lib/kokkos/cmake/README.md b/lib/kokkos/cmake/README.md index 385bbfcd5d..0548e89a90 100644 --- a/lib/kokkos/cmake/README.md +++ b/lib/kokkos/cmake/README.md @@ -310,20 +310,6 @@ When Kokkos is loaded by a downstream project, this TPL must be loaded. Calling this function simply appends text recording the location where the TPL was found and adding a `find_dependency(...)` call that will reload the CMake target. -### The Great TriBITS Compromise - -TriBITS was a masterpiece of CMake version 2 before the modern CMake idioms of building and using. -TriBITS greatly limited verbosity of CMake files, handled complicated dependency trees between packages, and handled automatically setting up include and linker paths for dependent libraries. - -Kokkos is now used by numerous projects that don't (and won't) depend on TriBITS for their build systems. -Kokkos has to work outside of TriBITS and provide a standard CMake 3+ build system. -At the same time, Kokkos is used by numerous projects that depend on TriBITS and don't (and won't) switch to a standard CMake 3+ build system. - -Instead of calling functions `TRIBITS_X(...)`, the CMake calls wrapper functions `KOKKOS_X(...)`. -If TriBITS is available (as in Trilinos), `KOKKOS_X` will just be a thin wrapper around `TRIBITS_X`. -If TriBITS is not available, Kokkos maps `KOKKOS_X` calls to native CMake that complies with CMake 3 idioms. -For the time being, this seems the most sensible way to handle the competing requirements of a standalone modern CMake and TriBITS build system. - ##### [LICENSE](https://github.com/kokkos/kokkos/blob/devel/LICENSE) [![License](https://img.shields.io/badge/License-BSD%203--Clause-blue.svg)](https://opensource.org/licenses/BSD-3-Clause) diff --git a/lib/kokkos/cmake/build_env_info.cmake b/lib/kokkos/cmake/build_env_info.cmake index 0eeb637245..76afbb74b6 100644 --- a/lib/kokkos/cmake/build_env_info.cmake +++ b/lib/kokkos/cmake/build_env_info.cmake @@ -2,121 +2,118 @@ find_package(Git QUIET) -SET(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) -SET(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) -SET(post_configure_dir ${CMAKE_BINARY_DIR}/generated) +set(CURRENT_LIST_DIR ${CMAKE_CURRENT_LIST_DIR}) +set(pre_configure_dir ${CMAKE_CURRENT_LIST_DIR}) +set(post_configure_dir ${CMAKE_CURRENT_BINARY_DIR}/generated) -SET(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) -SET(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) +set(pre_configure_file ${pre_configure_dir}/Kokkos_Version_Info.cpp.in) +set(post_configure_file ${post_configure_dir}/Kokkos_Version_Info.cpp) -FUNCTION(check_git_write git_hash git_clean_status) - FILE( - WRITE - ${CMAKE_BINARY_DIR}/git-state.txt - "${git_hash}-${git_clean_status}") -ENDFUNCTION() +function(check_git_write git_hash git_clean_status) + file(WRITE ${CMAKE_BINARY_DIR}/git-state.txt "${git_hash}-${git_clean_status}") +endfunction() -FUNCTION(check_git_read git_hash) - IF(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) - FILE(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) - LIST(GET CONTENT 0 var) +function(check_git_read git_hash) + if(EXISTS ${CMAKE_BINARY_DIR}/git-state.txt) + file(STRINGS ${CMAKE_BINARY_DIR}/git-state.txt CONTENT) + list(GET CONTENT 0 var) message(DEBUG "Cached Git hash: ${var}") - SET(${git_hash} ${var} PARENT_SCOPE) + set(${git_hash} ${var} PARENT_SCOPE) else() - SET(${git_hash} "INVALID" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(${git_hash} "INVALID" PARENT_SCOPE) + endif() +endfunction() -FUNCTION(check_git_version) - IF(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) - FILE( - COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp - DESTINATION ${post_configure_dir}) - ENDIF() +function(check_git_version) + if(NOT EXISTS ${post_configure_dir}/Kokkos_Version_Info.hpp) + file(COPY ${pre_configure_dir}/Kokkos_Version_Info.hpp DESTINATION ${post_configure_dir}) + endif() - IF(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) + if(NOT Git_FOUND OR NOT EXISTS ${KOKKOS_SOURCE_DIR}/.git) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) return() - ENDIF() + endif() # Get the current working branch execute_process( COMMAND ${GIT_EXECUTABLE} rev-parse --abbrev-ref HEAD WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_BRANCH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit description execute_process( COMMAND ${GIT_EXECUTABLE} show -s --format=%s WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DESCRIPTION - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Get the latest commit date execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%cI WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_DATE - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) # Check if repo is dirty / clean execute_process( COMMAND ${GIT_EXECUTABLE} diff-index --quiet HEAD -- WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} RESULT_VARIABLE IS_DIRTY - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - IF(IS_DIRTY EQUAL 0) - SET(GIT_CLEAN_STATUS "CLEAN") + if(IS_DIRTY EQUAL 0) + set(GIT_CLEAN_STATUS "CLEAN") else() - SET(GIT_CLEAN_STATUS "DIRTY") - ENDIF() + set(GIT_CLEAN_STATUS "DIRTY") + endif() # Get the latest abbreviated commit hash of the working branch execute_process( COMMAND ${GIT_EXECUTABLE} log -1 --format=%h WORKING_DIRECTORY ${KOKKOS_SOURCE_DIR} OUTPUT_VARIABLE GIT_COMMIT_HASH - OUTPUT_STRIP_TRAILING_WHITESPACE) + OUTPUT_STRIP_TRAILING_WHITESPACE + ) check_git_read(GIT_HASH_CACHE) - IF(NOT EXISTS ${post_configure_dir}) + if(NOT EXISTS ${post_configure_dir}) file(MAKE_DIRECTORY ${post_configure_dir}) - ENDIF() + endif() # Only update the git_version.cpp if the hash has changed. This will # prevent us from rebuilding the project more than we need to. - IF(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} - OR NOT EXISTS ${post_configure_file}) + if(NOT "${GIT_COMMIT_HASH}-${GIT_CLEAN_STATUS}" STREQUAL ${GIT_HASH_CACHE} OR NOT EXISTS ${post_configure_file}) # Set the GIT_HASH_CACHE variable so the next build won't have # to regenerate the source file. check_git_write(${GIT_COMMIT_HASH} ${GIT_CLEAN_STATUS}) configure_file(${pre_configure_file} ${post_configure_file} @ONLY) message(STATUS "Configured git information in ${post_configure_file}") - ENDIF() -ENDFUNCTION() + endif() +endfunction() -FUNCTION(check_git_setup) +function(check_git_setup) add_custom_target( - AlwaysCheckGit COMMAND ${CMAKE_COMMAND} - -DRUN_CHECK_GIT_VERSION=1 - -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} - -P ${CURRENT_LIST_DIR}/build_env_info.cmake - BYPRODUCTS ${post_configure_file}) + AlwaysCheckGit COMMAND ${CMAKE_COMMAND} -DRUN_CHECK_GIT_VERSION=1 -DKOKKOS_SOURCE_DIR=${Kokkos_SOURCE_DIR} -P + ${CURRENT_LIST_DIR}/build_env_info.cmake BYPRODUCTS ${post_configure_file} + ) - add_library(impl_git_version ${CMAKE_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) + add_library(impl_git_version ${CMAKE_CURRENT_BINARY_DIR}/generated/Kokkos_Version_Info.cpp) target_include_directories(impl_git_version PUBLIC ${CMAKE_BINARY_DIR}/generated) target_compile_features(impl_git_version PRIVATE cxx_raw_string_literals) add_dependencies(impl_git_version AlwaysCheckGit) check_git_version() -ENDFUNCTION() +endfunction() # This is used to run this function from an external cmake process. -IF(RUN_CHECK_GIT_VERSION) +if(RUN_CHECK_GIT_VERSION) check_git_version() -ENDIF() +endif() diff --git a/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp b/lib/kokkos/cmake/compile_tests/amd_apu.cc similarity index 57% rename from lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp rename to lib/kokkos/cmake/compile_tests/amd_apu.cc index 3c599b95a6..a9c1edbd57 100644 --- a/lib/kokkos/core/unit_test/sycl/TestSYCL_Task.cpp +++ b/lib/kokkos/cmake/compile_tests/amd_apu.cc @@ -14,5 +14,25 @@ // //@HEADER -#include -#include +#include +#include + +int main() { + hipDeviceProp_t hipProp; + hipError_t error = hipGetDeviceProperties(&hipProp, 0); + + if (error != hipSuccess) { + std::cout << hipGetErrorString(error) << '\n'; + return error; + } + + if (hipProp.integrated == 1) { + // We detected an APU + std::cout << "ON"; + } else { + // We detected a discrete GPU + std::cout << "OFF"; + } + + return 0; +} diff --git a/lib/kokkos/cmake/cray.cmake b/lib/kokkos/cmake/cray.cmake index 08912f5130..4ce5352bda 100644 --- a/lib/kokkos/cmake/cray.cmake +++ b/lib/kokkos/cmake/cray.cmake @@ -1,9 +1,6 @@ - - function(kokkos_set_cray_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "-hstd=c++${FULL_LC_STANDARD}", PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "-hstd=c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/lib/kokkos/cmake/deps/CUDA.cmake b/lib/kokkos/cmake/deps/CUDA.cmake index 5b6afd6151..49eaf883a4 100644 --- a/lib/kokkos/cmake/deps/CUDA.cmake +++ b/lib/kokkos/cmake/deps/CUDA.cmake @@ -17,24 +17,24 @@ # Check for CUDA support -SET(_CUDA_FAILURE OFF) +set(_CUDA_FAILURE OFF) # Have CMake find CUDA -IF(NOT _CUDA_FAILURE) - FIND_PACKAGE(CUDA 3.2) - IF (NOT CUDA_FOUND) - SET(_CUDA_FAILURE ON) - ENDIF() -ENDIF() +if(NOT _CUDA_FAILURE) + find_package(CUDA 3.2) + if(NOT CUDA_FOUND) + set(_CUDA_FAILURE ON) + endif() +endif() -IF(NOT _CUDA_FAILURE) +if(NOT _CUDA_FAILURE) # if we haven't met failure macro(PACKAGE_ADD_CUDA_LIBRARY cuda_target) - TRIBITS_ADD_LIBRARY(${cuda_target} ${ARGN} CUDALIBRARY) + tribits_add_library(${cuda_target} ${ARGN} CUDALIBRARY) endmacro() - GLOBAL_SET(TPL_CUDA_LIBRARY_DIRS) - GLOBAL_SET(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) - GLOBAL_SET(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) -ELSE() - SET(TPL_ENABLE_CUDA OFF) -ENDIF() + global_set(TPL_CUDA_LIBRARY_DIRS) + global_set(TPL_CUDA_INCLUDE_DIRS ${CUDA_TOOLKIT_INCLUDE}) + global_set(TPL_CUDA_LIBRARIES ${CUDA_CUDART_LIBRARY} ${CUDA_cublas_LIBRARY} ${CUDA_cufft_LIBRARY}) +else() + set(TPL_ENABLE_CUDA OFF) +endif() diff --git a/lib/kokkos/cmake/deps/HWLOC.cmake b/lib/kokkos/cmake/deps/HWLOC.cmake index 77d5a9b83a..52d8368d04 100644 --- a/lib/kokkos/cmake/deps/HWLOC.cmake +++ b/lib/kokkos/cmake/deps/HWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. # @@ -26,7 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/lib/kokkos/cmake/deps/Pthread.cmake b/lib/kokkos/cmake/deps/Pthread.cmake index e879bff374..b811f85084 100644 --- a/lib/kokkos/cmake/deps/Pthread.cmake +++ b/lib/kokkos/cmake/deps/Pthread.cmake @@ -15,31 +15,27 @@ # ************************************************************************ # @HEADER +set(USE_THREADS FALSE) -SET(USE_THREADS FALSE) - -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. - IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(Pthread) -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") + kokkos_create_imported_tpl_library(Pthread) +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/lib/kokkos/cmake/deps/quadmath.cmake b/lib/kokkos/cmake/deps/quadmath.cmake index 6aef08e881..9006d0cb9e 100644 --- a/lib/kokkos/cmake/deps/quadmath.cmake +++ b/lib/kokkos/cmake/deps/quadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +kokkos_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/lib/kokkos/cmake/fake_tribits.cmake b/lib/kokkos/cmake/fake_tribits.cmake index a18d2ac518..d3fe1e6e2f 100644 --- a/lib/kokkos/cmake/fake_tribits.cmake +++ b/lib/kokkos/cmake/fake_tribits.cmake @@ -1,288 +1,213 @@ #These are tribits wrappers used by all projects in the Kokkos ecosystem -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) +include(CMakeParseArguments) +include(CTest) -FUNCTION(ASSERT_DEFINED VARS) - FOREACH(VAR ${VARS}) - IF(NOT DEFINED ${VAR}) - MESSAGE(SEND_ERROR "Error, the variable ${VAR} is not defined!") - ENDIF() - ENDFOREACH() -ENDFUNCTION() - -IF(NOT KOKKOS_HAS_TRILINOS) -MACRO(APPEND_GLOB VAR) - FILE(GLOB LOCAL_TMP_VAR ${ARGN}) - LIST(APPEND ${VAR} ${LOCAL_TMP_VAR}) -ENDMACRO() - -MACRO(GLOBAL_SET VARNAME) - SET(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) -ENDMACRO() - -MACRO(PREPEND_GLOBAL_SET VARNAME) - ASSERT_DEFINED(${VARNAME}) - GLOBAL_SET(${VARNAME} ${ARGN} ${${VARNAME}}) -ENDMACRO() -ENDIF() - -MACRO(ADD_INTERFACE_LIBRARY LIB_NAME) - FILE(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") - ADD_LIBRARY(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) - SET_TARGET_PROPERTIES(${LIB_NAME} PROPERTIES INTERFACE TRUE) -ENDMACRO() - -FUNCTION(KOKKOS_ADD_TEST) - if (KOKKOS_HAS_TRILINOS) - CMAKE_PARSE_ARGUMENTS(TEST - "SKIP_TRIBITS" - "EXE;NAME;TOOL" - "ARGS" - ${ARGN}) - - IF(TEST_SKIP_TRIBITS) - MESSAGE(STATUS "Skipping test ${TEST_NAME} in TriBits") - RETURN() - ENDIF() - - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - - TRIBITS_ADD_TEST( - ${EXE_ROOT} - NAME ${TEST_NAME} - COMM serial mpi - NUM_MPI_PROCS 1 - ARGS ${TEST_ARGS} - ${TEST_UNPARSED_ARGUMENTS} - ADDED_TESTS_NAMES_OUT ALL_TESTS_ADDED - ) - - # We will get prepended package name here - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - - # The function TRIBITS_ADD_TEST() has a CATEGORIES argument that defaults - # to BASIC. If a project elects to only enable tests marked as PERFORMANCE, - # the test won't actually be added and attempting to set a property on it below - # will yield an error. - if(TARGET ${EXE}) - if(TEST_TOOL) - add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - foreach(TEST_ADDED ${ALL_TESTS_ADDED}) - set_property(TEST ${TEST_ADDED} APPEND PROPERTY ENVIRONMENT "KOKKOS_TOOLS_LIBS=$") - endforeach() - endif() +function(ASSERT_DEFINED VARS) + foreach(VAR ${VARS}) + if(NOT DEFINED ${VAR}) + message(SEND_ERROR "Error, the variable ${VAR} is not defined!") endif() - else() - CMAKE_PARSE_ARGUMENTS(TEST - "WILL_FAIL;SKIP_TRIBITS" - "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" - "CATEGORIES;ARGS" - ${ARGN}) - # To match Tribits, we should always be receiving - # the root names of exes/libs - IF(TEST_EXE) - SET(EXE_ROOT ${TEST_EXE}) - ELSE() - SET(EXE_ROOT ${TEST_NAME}) - ENDIF() - # Prepend package name to the test name - # These should be the full target name - SET(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) - SET(EXE ${PACKAGE_NAME}_${EXE_ROOT}) - IF(WIN32) - ADD_TEST(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} - COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} ${TEST_ARGS}) - ELSE() - ADD_TEST(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS}) - ENDIF() - IF(TEST_WILL_FAIL) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) - ENDIF() - IF(TEST_FAIL_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_PASS_REGULAR_EXPRESSION) - SET_TESTS_PROPERTIES(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) - ENDIF() - IF(TEST_TOOL) - ADD_DEPENDENCIES(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool - SET_PROPERTY(TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$") - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) - ENDIF() -ENDFUNCTION() + endforeach() +endfunction() -MACRO(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) - ADD_INTERFACE_LIBRARY(TPL_LIB_${TPL_NAME}) - TARGET_LINK_LIBRARIES(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) - TARGET_INCLUDE_DIRECTORIES(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) -ENDMACRO() +macro(APPEND_GLOB VAR) + file(GLOB LOCAL_TMP_VAR ${ARGN}) + list(APPEND ${VAR} ${LOCAL_TMP_VAR}) +endmacro() -FUNCTION(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES(${TPL_NAME} ${ARGN}) - else() - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" - ${ARGN}) +macro(GLOBAL_SET VARNAME) + set(${VARNAME} ${ARGN} CACHE INTERNAL "" FORCE) +endmacro() - SET(_${TPL_NAME}_ENABLE_SUCCESS TRUE) - IF (PARSE_REQUIRED_LIBS_NAMES) - FIND_LIBRARY(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) - IF(NOT TPL_${TPL_NAME}_LIBRARIES) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (PARSE_REQUIRED_HEADERS) - FIND_PATH(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) - IF(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) - SET(_${TPL_NAME}_ENABLE_SUCCESS FALSE) - ENDIF() - ENDIF() - IF (_${TPL_NAME}_ENABLE_SUCCESS) - KOKKOS_CREATE_IMPORTED_TPL_LIBRARY(${TPL_NAME}) - ENDIF() - VERIFY_EMPTY(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) - endif() -ENDFUNCTION() +macro(PREPEND_GLOBAL_SET VARNAME) + assert_defined(${VARNAME}) + global_set(${VARNAME} ${ARGN} ${${VARNAME}}) +endmacro() -MACRO(KOKKOS_TARGET_COMPILE_OPTIONS TARGET) -if(KOKKOS_HAS_TRILINOS) - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -else() - TARGET_COMPILE_OPTIONS(${TARGET} ${ARGN}) -endif() -ENDMACRO() +macro(ADD_INTERFACE_LIBRARY LIB_NAME) + file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp "") + add_library(${LIB_NAME} STATIC ${CMAKE_CURRENT_BINARY_DIR}/dummy.cpp) + set_target_properties(${LIB_NAME} PROPERTIES INTERFACE TRUE) +endmacro() -FUNCTION(KOKKOS_LIB_TYPE LIB RET) -GET_TARGET_PROPERTY(PROP ${LIB} TYPE) -IF (${PROP} STREQUAL "INTERFACE_LIBRARY") - SET(${RET} "INTERFACE" PARENT_SCOPE) -ELSE() - SET(${RET} "PUBLIC" PARENT_SCOPE) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) -IF(KOKKOS_HAS_TRILINOS) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - #don't trust tribits to do this correctly - but need to add package name - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSEIF(TARGET ${TARGET}) - #the target actually exists - this means we are doing separate libs - #or this a test library - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} ${ARGN}) -ELSE() - GET_PROPERTY(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - IF (${TARGET} IN_LIST LIBS) - SET_PROPERTY(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) - ELSE() - MESSAGE(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") - ENDIF() -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) -IF(KOKKOS_HAS_TRILINOS) - #do nothing -ELSE() - SET(options INTERFACE) - SET(oneValueArgs) - SET(multiValueArgs) - CMAKE_PARSE_ARGUMENTS(PARSE - "INTERFACE" - "" - "" - ${ARGN}) - SET(LINK_TYPE) - IF(PARSE_INTERFACE) - SET(LINK_TYPE INTERFACE) - ELSE() - SET(LINK_TYPE PUBLIC) - ENDIF() - TARGET_LINK_LIBRARIES(${TARGET} ${LINK_TYPE} ${DEPLIB}) - VERIFY_EMPTY(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - -FUNCTION(KOKKOS_ADD_TEST_LIBRARY NAME) -IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN} TESTONLY) -ELSE() - SET(oneValueArgs) - SET(multiValueArgs HEADERS SOURCES) - - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES;DEPLIBS" - ${ARGN}) - - SET(LIB_TYPE) - IF (PARSE_STATIC) - SET(LIB_TYPE STATIC) - ELSEIF (PARSE_SHARED) - SET(LIB_TYPE SHARED) - ENDIF() - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - ADD_LIBRARY(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) - IF (PARSE_DEPLIBS) - TARGET_LINK_LIBRARIES(${NAME} PRIVATE ${PARSE_DEPLIBS}) - ENDIF() -ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_INCLUDE_DIRECTORIES) -IF(KOKKOS_HAS_TRILINOS) - TRIBITS_INCLUDE_DIRECTORIES(${ARGN}) -ELSE() - CMAKE_PARSE_ARGUMENTS( - INC - "REQUIRED_DURING_INSTALLATION_TESTING" - "" - "" +function(KOKKOS_ADD_TEST) + cmake_parse_arguments( + TEST "WILL_FAIL;SKIP_TRIBITS" "FAIL_REGULAR_EXPRESSION;PASS_REGULAR_EXPRESSION;EXE;NAME;TOOL" "CATEGORIES;ARGS" ${ARGN} ) - INCLUDE_DIRECTORIES(${INC_UNPARSED_ARGUMENTS}) -ENDIF() -ENDFUNCTION() - - -MACRO(PRINTALL match) -get_cmake_property(_variableNames VARIABLES) -list (SORT _variableNames) -foreach (_variableName ${_variableNames}) - if("${_variableName}" MATCHES "${match}") - message(STATUS "${_variableName}=${${_variableName}}") + # To match Tribits, we should always be receiving + # the root names of exes/libs + if(TEST_EXE) + set(EXE_ROOT ${TEST_EXE}) + else() + set(EXE_ROOT ${TEST_NAME}) endif() -endforeach() -ENDMACRO() + # Prepend package name to the test name + # These should be the full target name + set(TEST_NAME ${PACKAGE_NAME}_${TEST_NAME}) -MACRO(SET_GLOBAL_REPLACE SUBSTR VARNAME) - STRING(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDMACRO() + # For compatibility with Trilinos testing, we support: + # * `-D _DISABLE=ON` + # * `-D _EXTRA_ARGS=";;;..."` + # * `-D _SET_RUN_SERIAL=ON` + if(${TEST_NAME}_DISABLE) + return() + endif() -FUNCTION(GLOBAL_APPEND VARNAME) + set(EXE ${PACKAGE_NAME}_${EXE_ROOT}) + if(WIN32) + add_test(NAME ${TEST_NAME} WORKING_DIRECTORY ${LIBRARY_OUTPUT_PATH} COMMAND ${EXE}${CMAKE_EXECUTABLE_SUFFIX} + ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS} + ) + else() + add_test(NAME ${TEST_NAME} COMMAND ${EXE} ${TEST_ARGS} ${${TEST_NAME}_EXTRA_ARGS}) + endif() + # Trilinos testing benefits from labeling the tests as "Kokkos" tests + set_tests_properties(${TEST_NAME} PROPERTIES LABELS Kokkos) + if(${TEST_NAME}_SET_RUN_SERIAL) + set_tests_properties(${TEST_NAME} PROPERTIES RUN_SERIAL ON) + endif() + # TriBITS doesn't actually currently support `-D _ENVIRONMENT` + # but we decided to add it anyway + if(${TEST_NAME}_ENVIRONMENT) + set_tests_properties(${TEST_NAME} PROPERTIES ENVIRONMENT "${${TEST_NAME}_ENVIRONMENT}") + endif() + if(TEST_WILL_FAIL) + set_tests_properties(${TEST_NAME} PROPERTIES WILL_FAIL ${TEST_WILL_FAIL}) + endif() + if(TEST_FAIL_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES FAIL_REGULAR_EXPRESSION ${TEST_FAIL_REGULAR_EXPRESSION}) + endif() + if(TEST_PASS_REGULAR_EXPRESSION) + set_tests_properties(${TEST_NAME} PROPERTIES PASS_REGULAR_EXPRESSION ${TEST_PASS_REGULAR_EXPRESSION}) + endif() + if(TEST_TOOL) + add_dependencies(${EXE} ${TEST_TOOL}) #make sure the exe has to build the tool + set_property( + TEST ${TEST_NAME} APPEND_STRING PROPERTY ENVIRONMENT "KOKKOS_PROFILE_LIBRARY=$" + ) + endif() + verify_empty(KOKKOS_ADD_TEST ${TEST_UNPARSED_ARGUMENTS}) +endfunction() + +macro(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY TPL_NAME) + add_interface_library(TPL_LIB_${TPL_NAME}) + target_link_libraries(TPL_LIB_${TPL_NAME} LINK_PUBLIC ${TPL_${TPL_NAME}_LIBRARIES}) + target_include_directories(TPL_LIB_${TPL_NAME} INTERFACE ${TPL_${TPL_NAME}_INCLUDE_DIRS}) +endmacro() + +function(KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES TPL_NAME) + cmake_parse_arguments(PARSE "" "" "REQUIRED_HEADERS;REQUIRED_LIBS_NAMES" ${ARGN}) + + set(_${TPL_NAME}_ENABLE_SUCCESS TRUE) + if(PARSE_REQUIRED_LIBS_NAMES) + find_library(TPL_${TPL_NAME}_LIBRARIES NAMES ${PARSE_REQUIRED_LIBS_NAMES}) + if(NOT TPL_${TPL_NAME}_LIBRARIES) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(PARSE_REQUIRED_HEADERS) + find_path(TPL_${TPL_NAME}_INCLUDE_DIRS NAMES ${PARSE_REQUIRED_HEADERS}) + if(NOT TPL_${TPL_NAME}_INCLUDE_DIRS) + set(_${TPL_NAME}_ENABLE_SUCCESS FALSE) + endif() + endif() + if(_${TPL_NAME}_ENABLE_SUCCESS) + kokkos_create_imported_tpl_library(${TPL_NAME}) + endif() + verify_empty(KOKKOS_CREATE_IMPORTED_TPL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() + +function(KOKKOS_LIB_TYPE LIB RET) + get_target_property(PROP ${LIB} TYPE) + if(${PROP} STREQUAL "INTERFACE_LIBRARY") + set(${RET} "INTERFACE" PARENT_SCOPE) + else() + set(${RET} "PUBLIC" PARENT_SCOPE) + endif() +endfunction() + +function(KOKKOS_TARGET_INCLUDE_DIRECTORIES TARGET) + if(TARGET ${TARGET}) + #the target actually exists - this means we are doing separate libs + #or this a test library + kokkos_lib_type(${TARGET} INCTYPE) + target_include_directories(${TARGET} ${INCTYPE} ${ARGN}) + else() + get_property(LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + if(${TARGET} IN_LIST LIBS) + set_property(GLOBAL APPEND PROPERTY KOKKOS_LIBRARY_INCLUDES ${ARGN}) + else() + message(FATAL_ERROR "Trying to set include directories on unknown target ${TARGET}") + endif() + endif() +endfunction() + +function(KOKKOS_LINK_INTERNAL_LIBRARY TARGET DEPLIB) + set(options INTERFACE) + set(oneValueArgs) + set(multiValueArgs) + cmake_parse_arguments(PARSE "INTERFACE" "" "" ${ARGN}) + set(LINK_TYPE) + if(PARSE_INTERFACE) + set(LINK_TYPE INTERFACE) + else() + set(LINK_TYPE PUBLIC) + endif() + target_link_libraries(${TARGET} ${LINK_TYPE} ${DEPLIB}) + verify_empty(KOKKOS_LINK_INTERNAL_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endfunction() + +function(KOKKOS_ADD_TEST_LIBRARY NAME) + set(oneValueArgs) + set(multiValueArgs HEADERS SOURCES) + + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES;DEPLIBS" ${ARGN}) + + set(LIB_TYPE) + if(PARSE_STATIC) + set(LIB_TYPE STATIC) + elseif(PARSE_SHARED) + set(LIB_TYPE SHARED) + endif() + + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + add_library(${NAME} ${LIB_TYPE} ${PARSE_SOURCES}) + if(PARSE_DEPLIBS) + target_link_libraries(${NAME} PRIVATE ${PARSE_DEPLIBS}) + endif() +endfunction() + +function(KOKKOS_INCLUDE_DIRECTORIES) + cmake_parse_arguments(INC "REQUIRED_DURING_INSTALLATION_TESTING" "" "" ${ARGN}) + include_directories(${INC_UNPARSED_ARGUMENTS}) +endfunction() + +macro(PRINTALL match) + get_cmake_property(_variableNames VARIABLES) + list(SORT _variableNames) + foreach(_variableName ${_variableNames}) + if("${_variableName}" MATCHES "${match}") + message(STATUS "${_variableName}=${${_variableName}}") + endif() + endforeach() +endmacro() + +macro(SET_GLOBAL_REPLACE SUBSTR VARNAME) + string(REPLACE ${SUBSTR} ${${VARNAME}} TEMP) + global_set(${VARNAME} ${TEMP}) +endmacro() + +function(GLOBAL_APPEND VARNAME) #We make this a function since we are setting variables #and want to use scope to avoid overwriting local variables - SET(TEMP ${${VARNAME}}) - LIST(APPEND TEMP ${ARGN}) - GLOBAL_SET(${VARNAME} ${TEMP}) -ENDFUNCTION() + set(TEMP ${${VARNAME}}) + list(APPEND TEMP ${ARGN}) + global_set(${VARNAME} ${TEMP}) +endfunction() diff --git a/lib/kokkos/cmake/gnu.cmake b/lib/kokkos/cmake/gnu.cmake index aa11fe87b1..e53b4a7bec 100644 --- a/lib/kokkos/cmake/gnu.cmake +++ b/lib/kokkos/cmake/gnu.cmake @@ -1,23 +1,21 @@ - -FUNCTION(kokkos_set_gnu_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_gnu_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.7.2 and then modified. - IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "-std=gnu++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=gnu++${INT_LC_STANDARD}" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "-std=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "-std=c++${INT_LC_STANDARD}" PARENT_SCOPE) + endif() +endfunction() diff --git a/lib/kokkos/cmake/intel.cmake b/lib/kokkos/cmake/intel.cmake index 7e6ee3358c..b7752caabd 100644 --- a/lib/kokkos/cmake/intel.cmake +++ b/lib/kokkos/cmake/intel.cmake @@ -1,18 +1,15 @@ - -FUNCTION(kokkos_set_intel_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) +function(kokkos_set_intel_flags full_standard int_standard) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) # The following three blocks of code were copied from # /Modules/Compiler/Intel-CXX.cmake from CMake 3.18.1 and then modified. - IF(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) - SET(_std -Qstd) - SET(_ext c++) - ELSE() - SET(_std -std) - SET(_ext gnu++) - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) -ENDFUNCTION() - - + if(CMAKE_CXX_SIMULATE_ID STREQUAL MSVC) + set(_std -Qstd) + set(_ext c++) + else() + set(_std -std) + set(_ext gnu++) + endif() + set(KOKKOS_CXX_STANDARD_FLAG "${_std}=c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "${_std}=${_ext}${INT_LC_STANDARD}" PARENT_SCOPE) +endfunction() diff --git a/lib/kokkos/cmake/kokkos_arch.cmake b/lib/kokkos/cmake/kokkos_arch.cmake index 0b3d4044d0..ae45da806f 100644 --- a/lib/kokkos/cmake/kokkos_arch.cmake +++ b/lib/kokkos/cmake/kokkos_arch.cmake @@ -1,611 +1,732 @@ - -FUNCTION(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) +function(KOKKOS_ARCH_OPTION SUFFIX DEV_TYPE DESCRIPTION DEPENDENCY) #all optimizations off by default - KOKKOS_DEPENDENT_OPTION(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) - SET(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - IF(KOKKOS_ARCH_${SUFFIX}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) - SET(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - + kokkos_dependent_option(ARCH_${SUFFIX} "Optimize for ${DESCRIPTION} (${DEV_TYPE})" OFF "${DEPENDENCY}" OFF) + set(KOKKOS_ARCH_${SUFFIX} ${KOKKOS_ARCH_${SUFFIX}} PARENT_SCOPE) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + if(KOKKOS_ARCH_${SUFFIX}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${SUFFIX}) + set(KOKKOS_ENABLED_ARCH_LIST ${KOKKOS_ENABLED_ARCH_LIST} PARENT_SCOPE) + endif() +endfunction() # Make sure devices and compiler ID are done -KOKKOS_CFG_DEPENDS(ARCH COMPILER_ID) -KOKKOS_CFG_DEPENDS(ARCH DEVICES) -KOKKOS_CFG_DEPENDS(ARCH OPTIONS) +kokkos_cfg_depends(ARCH COMPILER_ID) +kokkos_cfg_depends(ARCH DEVICES) +kokkos_cfg_depends(ARCH OPTIONS) -KOKKOS_CHECK_DEPRECATED_OPTIONS( - ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" - ARCH_RYZEN "Please replace RYZEN with ZEN or ZEN2, depending on your platform" +kokkos_check_deprecated_options( + ARCH_EPYC "Please replace EPYC with ZEN or ZEN2, depending on your platform" ARCH_RYZEN + "Please replace RYZEN with ZEN or ZEN2, depending on your platform" ) #------------------------------------------------------------------------------- # List of possible host architectures. #------------------------------------------------------------------------------- -SET(KOKKOS_ARCH_LIST) +set(KOKKOS_ARCH_LIST) include(CheckCXXCompilerFlag) -KOKKOS_DEPRECATED_LIST(ARCH ARCH) +kokkos_deprecated_list(ARCH ARCH) -SET(HOST_ARCH_ALREADY_SPECIFIED "") -MACRO(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) - KOKKOS_ARCH_OPTION(${ARCH} HOST "${LABEL}" TRUE) - IF(KOKKOS_ARCH_${ARCH}) - IF(HOST_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) - ENDIF() -ENDMACRO() +set(HOST_ARCH_ALREADY_SPECIFIED "") +macro(DECLARE_AND_CHECK_HOST_ARCH ARCH LABEL) + kokkos_arch_option(${ARCH} HOST "${LABEL}" TRUE) + if(KOKKOS_ARCH_${ARCH}) + if(HOST_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple host architectures given! Already have ${HOST_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." + ) + endif() + set(HOST_ARCH_ALREADY_SPECIFIED ${ARCH}) + endif() +endmacro() -DECLARE_AND_CHECK_HOST_ARCH(NATIVE "local machine") -DECLARE_AND_CHECK_HOST_ARCH(AMDAVX "AMD chip") -DECLARE_AND_CHECK_HOST_ARCH(ARMV80 "ARMv8.0 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV81 "ARMv8.1 Compatible CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") -DECLARE_AND_CHECK_HOST_ARCH(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") -DECLARE_AND_CHECK_HOST_ARCH(A64FX "ARMv8.2 with SVE Support") -DECLARE_AND_CHECK_HOST_ARCH(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") -DECLARE_AND_CHECK_HOST_ARCH(SNB "Intel Sandy/Ivy Bridge CPUs") -DECLARE_AND_CHECK_HOST_ARCH(HSW "Intel Haswell CPUs") -DECLARE_AND_CHECK_HOST_ARCH(BDW "Intel Broadwell Xeon E-class CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ICL "Intel Ice Lake Client CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(SKL "Intel Skylake Client CPUs") -DECLARE_AND_CHECK_HOST_ARCH(SKX "Intel Skylake Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(KNC "Intel Knights Corner Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(KNL "Intel Knights Landing Xeon Phi") -DECLARE_AND_CHECK_HOST_ARCH(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") -DECLARE_AND_CHECK_HOST_ARCH(POWER8 "IBM POWER8 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(POWER9 "IBM POWER9 CPUs") -DECLARE_AND_CHECK_HOST_ARCH(ZEN "AMD Zen architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN2 "AMD Zen2 architecture") -DECLARE_AND_CHECK_HOST_ARCH(ZEN3 "AMD Zen3 architecture") -DECLARE_AND_CHECK_HOST_ARCH(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(NATIVE "local machine") +declare_and_check_host_arch(AMDAVX "AMD chip") +declare_and_check_host_arch(ARMV80 "ARMv8.0 Compatible CPU") +declare_and_check_host_arch(ARMV81 "ARMv8.1 Compatible CPU") +declare_and_check_host_arch(ARMV8_THUNDERX "ARMv8 Cavium ThunderX CPU") +declare_and_check_host_arch(ARMV8_THUNDERX2 "ARMv8 Cavium ThunderX2 CPU") +declare_and_check_host_arch(A64FX "ARMv8.2 with SVE Support") +declare_and_check_host_arch(ARMV9_GRACE "ARMv9 NVIDIA Grace CPU") +declare_and_check_host_arch(SNB "Intel Sandy/Ivy Bridge CPUs") +declare_and_check_host_arch(HSW "Intel Haswell CPUs") +declare_and_check_host_arch(BDW "Intel Broadwell Xeon E-class CPUs") +declare_and_check_host_arch(ICL "Intel Ice Lake Client CPUs (AVX512)") +declare_and_check_host_arch(ICX "Intel Ice Lake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(SKL "Intel Skylake Client CPUs") +declare_and_check_host_arch(SKX "Intel Skylake Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(KNC "Intel Knights Corner Xeon Phi") +declare_and_check_host_arch(KNL "Intel Knights Landing Xeon Phi") +declare_and_check_host_arch(SPR "Intel Sapphire Rapids Xeon Server CPUs (AVX512)") +declare_and_check_host_arch(POWER8 "IBM POWER8 CPUs") +declare_and_check_host_arch(POWER9 "IBM POWER9 CPUs") +declare_and_check_host_arch(ZEN "AMD Zen architecture") +declare_and_check_host_arch(ZEN2 "AMD Zen2 architecture") +declare_and_check_host_arch(ZEN3 "AMD Zen3 architecture") +declare_and_check_host_arch(RISCV_SG2042 "SG2042 (RISC-V) CPUs") +declare_and_check_host_arch(RISCV_RVA22V "RVA22V (RISC-V) CPUs") -IF(Kokkos_ENABLE_CUDA OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_CUDA_ARCHS ON) -ENDIF() +if(Kokkos_ENABLE_CUDA + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_CUDA_ARCHS ON) +endif() -KOKKOS_ARCH_OPTION(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER32 GPU "NVIDIA Kepler generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") -KOKKOS_ARCH_OPTION(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER30 GPU "NVIDIA Kepler generation CC 3.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER32 GPU "NVIDIA Kepler generation CC 3.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER35 GPU "NVIDIA Kepler generation CC 3.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(KEPLER37 GPU "NVIDIA Kepler generation CC 3.7" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL50 GPU "NVIDIA Maxwell generation CC 5.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL52 GPU "NVIDIA Maxwell generation CC 5.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(MAXWELL53 GPU "NVIDIA Maxwell generation CC 5.3" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL60 GPU "NVIDIA Pascal generation CC 6.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(PASCAL61 GPU "NVIDIA Pascal generation CC 6.1" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA70 GPU "NVIDIA Volta generation CC 7.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(VOLTA72 GPU "NVIDIA Volta generation CC 7.2" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(TURING75 GPU "NVIDIA Turing generation CC 7.5" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE80 GPU "NVIDIA Ampere generation CC 8.0" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(AMPERE86 GPU "NVIDIA Ampere generation CC 8.6" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(ADA89 GPU "NVIDIA Ada generation CC 8.9" "KOKKOS_SHOW_CUDA_ARCHS") +kokkos_arch_option(HOPPER90 GPU "NVIDIA Hopper generation CC 9.0" "KOKKOS_SHOW_CUDA_ARCHS") -IF(Kokkos_ENABLE_HIP OR Kokkos_ENABLE_OPENMPTARGET OR Kokkos_ENABLE_OPENACC OR Kokkos_ENABLE_SYCL) - SET(KOKKOS_SHOW_HIP_ARCHS ON) -ENDIF() +if(Kokkos_ENABLE_HIP + OR Kokkos_ENABLE_OPENMPTARGET + OR Kokkos_ENABLE_OPENACC + OR Kokkos_ENABLE_SYCL +) + set(KOKKOS_SHOW_HIP_ARCHS ON) +endif() # AMD archs ordered in decreasing priority of autodetection -LIST(APPEND SUPPORTED_AMD_GPUS MI300 MI300) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX940) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx940) -LIST(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) -LIST(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) -LIST(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) -LIST(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) -LIST(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) -LIST(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) +list(APPEND SUPPORTED_AMD_GPUS MI300 MI300A MI300) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX942 AMD_GFX942_APU AMD_GFX940) +list(APPEND CORRESPONDING_AMD_FLAGS gfx942 gfx942 gfx940) +list(APPEND SUPPORTED_AMD_GPUS MI200 MI200 MI100 MI100) +list(APPEND SUPPORTED_AMD_ARCHS VEGA90A AMD_GFX90A VEGA908 AMD_GFX908) +list(APPEND CORRESPONDING_AMD_FLAGS gfx90a gfx90a gfx908 gfx908) +list(APPEND SUPPORTED_AMD_GPUS MI50/60 MI50/60) +list(APPEND SUPPORTED_AMD_ARCHS VEGA906 AMD_GFX906) +list(APPEND CORRESPONDING_AMD_FLAGS gfx906 gfx906) +list(APPEND SUPPORTED_AMD_GPUS PHOENIX RX7900XTX V620/W6800 V620/W6800) +list(APPEND SUPPORTED_AMD_ARCHS AMD_GFX1103 AMD_GFX1100 NAVI1030 AMD_GFX1030) +list(APPEND CORRESPONDING_AMD_FLAGS gfx1103 gfx1100 gfx1030 gfx1030) #FIXME CAN BE REPLACED WITH LIST_ZIP IN CMAKE 3.17 -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - KOKKOS_ARCH_OPTION(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET SUPPORTED_AMD_GPUS ${LIST_INDEX} GPU) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + kokkos_arch_option(${ARCH} GPU "AMD GPU ${GPU} ${FLAG}" "KOKKOS_SHOW_HIP_ARCHS") +endforeach() -IF(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) - SET(KOKKOS_SHOW_SYCL_ARCHS ON) -ENDIF() +if(Kokkos_ENABLE_SYCL OR Kokkos_ENABLE_OPENMPTARGET) + set(KOKKOS_SHOW_SYCL_ARCHS ON) +endif() -KOKKOS_ARCH_OPTION(INTEL_GEN GPU "SPIR64-based devices, e.g. Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") -KOKKOS_ARCH_OPTION(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN GPU "SPIR64-based devices, e.g. Intel GPUs, using JIT" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_DG1 GPU "Intel Iris XeMAX GPU" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN9 GPU "Intel GPU Gen9" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN11 GPU "Intel GPU Gen11" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_GEN12LP GPU "Intel GPU Gen12LP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_XEHP GPU "Intel GPU Xe-HP" "KOKKOS_SHOW_SYCL_ARCHS") +kokkos_arch_option(INTEL_PVC GPU "Intel GPU Ponte Vecchio" "KOKKOS_SHOW_SYCL_ARCHS") -IF(KOKKOS_ENABLE_COMPILER_WARNINGS) - SET(COMMON_WARNINGS - "-Wall" "-Wextra" "-Wunused-parameter" "-Wshadow" "-pedantic" - "-Wsign-compare" "-Wtype-limits" "-Wuninitialized") +if(KOKKOS_ENABLE_COMPILER_WARNINGS) + set(COMMON_WARNINGS + "-Wall" + "-Wextra" + "-Wunused-parameter" + "-Wshadow" + "-pedantic" + "-Wsign-compare" + "-Wtype-limits" + "-Wuninitialized" + "-Wsuggest-override" + ) # NOTE KOKKOS_ prefixed variable (all uppercase) is not set yet because TPLs are processed after ARCH - IF(Kokkos_ENABLE_LIBQUADMATH) + if(Kokkos_ENABLE_LIBQUADMATH) # warning: non-standard suffix on floating constant [-Wpedantic] - LIST(REMOVE_ITEM COMMON_WARNINGS "-pedantic") - ENDIF() + list(REMOVE_ITEM COMMON_WARNINGS "-pedantic") + endif() # NVHPC compiler does not support -Wtype-limits. - IF(KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") - ENDIF() - ENDIF() + if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + list(REMOVE_ITEM COMMON_WARNINGS "-Wtype-limits") + endif() + endif() - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - LIST(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + # ICPC doesn't support -Wsuggest-override + if(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + list(REMOVE_ITEM COMMON_WARNINGS "-Wsuggest-override") + endif() - SET(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" - ${COMMON_WARNINGS}) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - LIST(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") - ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + list(APPEND COMMON_WARNINGS "-Wimplicit-fallthrough") + endif() + + set(GNU_WARNINGS "-Wempty-body" "-Wclobbered" "-Wignored-qualifiers" ${COMMON_WARNINGS}) + if(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + list(APPEND GNU_WARNINGS "-Wimplicit-fallthrough") + endif() # Not using COMPILER_SPECIFIC_FLAGS function so the warning flags are not passed downstream - IF(CMAKE_CXX_COMPILER_ID STREQUAL GNU) - STRING(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") - ELSEIF(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) + if(CMAKE_CXX_COMPILER_ID STREQUAL GNU) + string(REPLACE ";" " " WARNING_FLAGS "${GNU_WARNINGS}") + elseif(CMAKE_CXX_COMPILER_ID STREQUAL NVHPC) # FIXME_NVHPC - ELSE() - STRING(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") - ENDIF() - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") -ENDIF() - + else() + string(REPLACE ";" " " WARNING_FLAGS "${COMMON_WARNINGS}") + endif() + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${WARNING_FLAGS}") +endif() #------------------------------- KOKKOS_CUDA_OPTIONS --------------------------- #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_CUDA_OPTIONS) +global_set(KOKKOS_CUDA_OPTIONS) # Construct the Makefile options -IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-extended-lambda") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-extended-lambda") + global_append(KOKKOS_CUDA_OPTIONS "-Wext-lambda-captures-this") +endif() -IF (KOKKOS_ENABLE_CUDA_CONSTEXPR) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_CUDA_CONSTEXPR) + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_CUDA_OPTIONS "-expt-relaxed-constexpr") + endif() +endif() -IF (KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - SET(CUDA_ARCH_FLAG "--cuda-gpu-arch") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -x cuda) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + set(CUDA_ARCH_FLAG "--cuda-gpu-arch") + global_append(KOKKOS_CUDA_OPTIONS -x cuda) # Kokkos_CUDA_DIR has priority over CUDAToolkit_BIN_DIR - IF (Kokkos_CUDA_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) - ELSEIF(CUDAToolkit_BIN_DIR) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - SET(CUDA_ARCH_FLAG "-arch") -ENDIF() - -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - STRING(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) - IF (KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -lineinfo) - ENDIF() - UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -ENDIF() + if(Kokkos_CUDA_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${Kokkos_CUDA_DIR}) + elseif(CUDAToolkit_BIN_DIR) + global_append(KOKKOS_CUDA_OPTIONS --cuda-path=${CUDAToolkit_BIN_DIR}/..) + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + set(CUDA_ARCH_FLAG "-arch") +endif() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + string(TOUPPER "${CMAKE_BUILD_TYPE}" _UPPERCASE_CMAKE_BUILD_TYPE) + if(KOKKOS_ENABLE_DEBUG OR _UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + global_append(KOKKOS_CUDA_OPTIONS -lineinfo) + endif() + unset(_UPPERCASE_CMAKE_BUILD_TYPE) +endif() #------------------------------- KOKKOS_HIP_OPTIONS --------------------------- -KOKKOS_OPTION(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") -KOKKOS_OPTION(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_FLAGS) -MARK_AS_ADVANCED(Kokkos_IMPL_AMDGPU_LINK) +kokkos_option(IMPL_AMDGPU_FLAGS "" STRING "Set compiler flags for AMD GPUs") +kokkos_option(IMPL_AMDGPU_LINK "" STRING "Set linker flags for AMD GPUs") +mark_as_advanced(Kokkos_IMPL_AMDGPU_FLAGS) +mark_as_advanced(Kokkos_IMPL_AMDGPU_LINK) #clear anything that might be in the cache -GLOBAL_SET(KOKKOS_AMDGPU_OPTIONS) -IF(KOKKOS_ENABLE_HIP) - SET(AMDGPU_ARCH_FLAG "--offload-arch") - IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF (NOT CMAKE_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS -xhip) - IF(DEFINED ENV{ROCM_PATH}) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) - ENDIF() - ENDIF() -ENDIF() +global_set(KOKKOS_AMDGPU_OPTIONS) +if(KOKKOS_ENABLE_HIP) + set(AMDGPU_ARCH_FLAG "--offload-arch") + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(NOT CMAKE_CXX_STANDARD) + message(FATAL_ERROR "Kokkos requires CMAKE_CXX_STANDARD to set to 17 or higher") + endif() + global_append(KOKKOS_AMDGPU_OPTIONS -xhip) + if(DEFINED ENV{ROCM_PATH}) + global_append(KOKKOS_AMDGPU_OPTIONS --rocm-path=$ENV{ROCM_PATH}) + endif() + endif() +endif() +if(KOKKOS_ARCH_NATIVE) + if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") + message(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") + endif() -IF(KOKKOS_ARCH_NATIVE) - IF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC") - MESSAGE(FATAL_ERROR "MSVC doesn't support ARCH_NATIVE!") - ENDIF() + string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) + if(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") + set(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") + else() + set(KOKKOS_NATIVE_FLAGS "-mcpu=native") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID NVHPC -tp=native DEFAULT ${KOKKOS_NATIVE_FLAGS}) +endif() - STRING(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" KOKKOS_UC_SYSTEM_PROCESSOR) - IF(KOKKOS_UC_SYSTEM_PROCESSOR MATCHES "(X86)|(AMD64)") - SET(KOKKOS_NATIVE_FLAGS "-march=native;-mtune=native") - ELSE() - SET(KOKKOS_NATIVE_FLAGS "-mcpu=native") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - NVHPC -tp=native - DEFAULT ${KOKKOS_NATIVE_FLAGS} +if(KOKKOS_ARCH_ARMV80) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV80) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a +if(KOKKOS_ARCH_ARMV81) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.1-a ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV81) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.1-a +if(KOKKOS_ARCH_ARMV8_THUNDERX) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.0 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8-a + -mtune=thunderx ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV8_THUNDERX) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV80 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.0 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8-a -mtune=thunderx +if(KOKKOS_ARCH_ARMV8_THUNDERX2) + set(KOKKOS_ARCH_ARM_NEON ON) + set(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + MSVC + /arch:armv8.1 + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=thunderx2t99 + -mtune=thunderx2t99 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ARMV8_THUNDERX2) - SET(KOKKOS_ARCH_ARM_NEON ON) - SET(KOKKOS_ARCH_ARMV81 ON) #Not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - MSVC /arch:armv8.1 - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=thunderx2t99 -mtune=thunderx2t99 +if(KOKKOS_ARCH_A64FX) + set(KOKKOS_ARCH_ARM_NEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Clang + -march=armv8.2-a+sve + -msve-vector-bits=512 + GNU + -march=armv8.2-a+sve + -msve-vector-bits=512 + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -march=armv8.2-a+sve ) -ENDIF() +endif() -IF (KOKKOS_ARCH_A64FX) - SET(KOKKOS_ARCH_ARM_NEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Clang -march=armv8.2-a+sve -msve-vector-bits=512 - GNU -march=armv8.2-a+sve -msve-vector-bits=512 - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -march=armv8.2-a+sve - ) -ENDIF() - -IF (KOKKOS_ARCH_ARMV9_GRACE) - SET(KOKKOS_ARCH_ARM_NEON ON) +if(KOKKOS_ARCH_ARMV9_GRACE) + set(KOKKOS_ARCH_ARM_NEON ON) check_cxx_compiler_flag("-mcpu=neoverse-n2" COMPILER_SUPPORTS_NEOVERSE_N2) check_cxx_compiler_flag("-msve-vector-bits=128" COMPILER_SUPPORTS_SVE_VECTOR_BITS) - IF (COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128 - ) - ELSE() - MESSAGE(WARNING "Compiler does not support ARMv9 Grace architecture") - ENDIF() -ENDIF() + if(COMPILER_SUPPORTS_NEOVERSE_N2 AND COMPILER_SUPPORTS_SVE_VECTOR_BITS) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -mcpu=neoverse-n2 -msve-vector-bits=128) + else() + message(WARNING "Compiler does not support ARMv9 Grace architecture") + endif() +endif() -IF (KOKKOS_ARCH_ZEN) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen - DEFAULT -march=znver1 -mtune=znver1 +if(KOKKOS_ARCH_ZEN) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen + DEFAULT + -march=znver1 + -mtune=znver1 ) - SET(KOKKOS_ARCH_AMD_ZEN ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() + set(KOKKOS_ARCH_AMD_ZEN ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() -IF (KOKKOS_ARCH_ZEN2) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver2 -mtune=znver2 +if(KOKKOS_ARCH_ZEN2) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver2 + -mtune=znver2 ) - SET(KOKKOS_ARCH_AMD_ZEN2 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() + set(KOKKOS_ARCH_AMD_ZEN2 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() -IF (KOKKOS_ARCH_ZEN3) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Intel -mavx2 - MSVC /arch:AVX2 - NVHPC -tp=zen2 - DEFAULT -march=znver3 -mtune=znver3 +if(KOKKOS_ARCH_ZEN3) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Intel + -mavx2 + MSVC + /arch:AVX2 + NVHPC + -tp=zen2 + DEFAULT + -march=znver3 + -mtune=znver3 ) - SET(KOKKOS_ARCH_AMD_ZEN3 ON) - SET(KOKKOS_ARCH_AVX2 ON) -ENDIF() + set(KOKKOS_ARCH_AMD_ZEN3 ON) + set(KOKKOS_ARCH_AVX2 ON) +endif() -IF (KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) - SET(KOKKOS_ARCH_AVX ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -mavx - MSVC /arch:AVX - NVHPC -tp=sandybridge - DEFAULT -mavx +if(KOKKOS_ARCH_SNB OR KOKKOS_ARCH_AMDAVX) + set(KOKKOS_ARCH_AVX ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -mavx + MSVC + /arch:AVX + NVHPC + -tp=sandybridge + DEFAULT + -mavx ) -ENDIF() +endif() -IF (KOKKOS_ARCH_HSW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 +if(KOKKOS_ARCH_HSW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_RISCV_SG2042) - IF(NOT - (KOKKOS_CXX_COMPILER_ID STREQUAL GNU - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) - OR - (KOKKOS_CXX_COMPILER_ID STREQUAL Clang - AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) +if(KOKKOS_ARCH_RISCV_SG2042) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) - MESSAGE(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") - ENDIF() - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - DEFAULT -march=rv64imafdcv - ) -ENDIF() + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT -march=rv64imafdcv) +endif() - -IF (KOKKOS_ARCH_BDW) - SET(KOKKOS_ARCH_AVX2 ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX2 - MSVC /arch:AVX2 - NVHPC -tp=haswell - DEFAULT -march=core-avx2 -mtune=core-avx2 -mrtm +if(KOKKOS_ARCH_RISCV_RVA22V) + if(NOT (KOKKOS_CXX_COMPILER_ID STREQUAL GNU AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND KOKKOS_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 14) ) -ENDIF() + message(SEND_ERROR "Only gcc >= 12 and clang >= 14 support RISC-V.") + endif() + compiler_specific_flags( + COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID DEFAULT + -march=rv64imafdcv_sscofpmf_sstc_svpbmt_zicbom_zicboz_zicbop_zihintpause + ) +endif() -IF (KOKKOS_ARCH_KNL) +if(KOKKOS_ARCH_BDW) + set(KOKKOS_ARCH_AVX2 ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX2 + MSVC + /arch:AVX2 + NVHPC + -tp=haswell + DEFAULT + -march=core-avx2 + -mtune=core-avx2 + -mrtm + ) +endif() + +if(KOKKOS_ARCH_KNL) #avx512-mic - SET(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xMIC-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=knl - DEFAULT -march=knl -mtune=knl + set(KOKKOS_ARCH_AVX512MIC ON) #not a cache variable + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xMIC-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=knl + DEFAULT + -march=knl + -mtune=knl ) -ENDIF() +endif() -IF (KOKKOS_ARCH_KNC) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - DEFAULT -mmic - ) -ENDIF() +if(KOKKOS_ARCH_KNC) + compiler_specific_flags(COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID MSVC NO-VALUE-SPECIFIED DEFAULT -mmic) +endif() -IF (KOKKOS_ARCH_SKL) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xSKYLAKE - MSVC /arch:AVX2 - NVHPC -tp=skylake - DEFAULT -march=skylake -mtune=skylake +if(KOKKOS_ARCH_SKL) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xSKYLAKE + MSVC + /arch:AVX2 + NVHPC + -tp=skylake + DEFAULT + -march=skylake + -mtune=skylake ) -ENDIF() +endif() -IF (KOKKOS_ARCH_SKX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - Cray NO-VALUE-SPECIFIED - Intel -xCORE-AVX512 - MSVC /arch:AVX512 - NVHPC -tp=skylake - DEFAULT -march=skylake-avx512 -mtune=skylake-avx512 +if(KOKKOS_ARCH_SKX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + Cray + NO-VALUE-SPECIFIED + Intel + -xCORE-AVX512 + MSVC + /arch:AVX512 + NVHPC + -tp=skylake + DEFAULT + -march=skylake-avx512 + -mtune=skylake-avx512 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ICL) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-client -mtune=icelake-client +if(KOKKOS_ARCH_ICL) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-client + -mtune=icelake-client ) -ENDIF() +endif() -IF (KOKKOS_ARCH_ICX) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=icelake-server -mtune=icelake-server +if(KOKKOS_ARCH_ICX) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=icelake-server + -mtune=icelake-server ) -ENDIF() +endif() -IF (KOKKOS_ARCH_SPR) - SET(KOKKOS_ARCH_AVX512XEON ON) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC /arch:AVX512 - DEFAULT -march=sapphirerapids -mtune=sapphirerapids +if(KOKKOS_ARCH_SPR) + set(KOKKOS_ARCH_AVX512XEON ON) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + /arch:AVX512 + DEFAULT + -march=sapphirerapids + -mtune=sapphirerapids ) -ENDIF() +endif() -IF (KOKKOS_ARCH_POWER7) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC NO-VALUE-SPECIFIED - DEFAULT -mcpu=power7 -mtune=power7 +if(KOKKOS_ARCH_POWER7) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + NO-VALUE-SPECIFIED + DEFAULT + -mcpu=power7 + -mtune=power7 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_POWER8) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr8 - DEFAULT -mcpu=power8 -mtune=power8 +if(KOKKOS_ARCH_POWER8) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr8 + DEFAULT + -mcpu=power8 + -mtune=power8 ) -ENDIF() +endif() -IF (KOKKOS_ARCH_POWER9) - COMPILER_SPECIFIC_FLAGS( - COMPILER_ID KOKKOS_CXX_HOST_COMPILER_ID - MSVC NO-VALUE-SPECIFIED - NVHPC -tp=pwr9 - DEFAULT -mcpu=power9 -mtune=power9 +if(KOKKOS_ARCH_POWER9) + compiler_specific_flags( + COMPILER_ID + KOKKOS_CXX_HOST_COMPILER_ID + MSVC + NO-VALUE-SPECIFIED + NVHPC + -tp=pwr9 + DEFAULT + -mcpu=power9 + -mtune=power9 ) -ENDIF() +endif() # If Kokkos_ARCH_NATIVE is enabled, we are trying to autodetect # the SIMD capabilities based on compiler macros. -IF (KOKKOS_ARCH_NATIVE) +if(KOKKOS_ARCH_NATIVE) # Make sure to rerun the checks if compile options have changed - IF(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") - SET(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") + if(NOT "${KOKKOS_COMPILE_OPTIONS}" STREQUAL "${KOKKOS_COMPILE_OPTIONS_SAVED}") + set(KOKKOS_COMPILE_OPTIONS_SAVED "${KOKKOS_COMPILE_OPTIONS}" CACHE INTERNAL "") - SET(CMAKE_REQUIRED_QUIET ON) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) + set(CMAKE_REQUIRED_QUIET ON) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) - UNSET(KOKKOS_COMPILER_HAS_AVX512 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) - UNSET(KOKKOS_COMPILER_HAS_AVX2 CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) - UNSET(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) - CHECK_CXX_SYMBOL_EXISTS(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) - UNSET(KOKKOS_COMPILER_HAS_AVX CACHE) - CHECK_CXX_SYMBOL_EXISTS(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) - SET(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + unset(KOKKOS_COMPILER_HAS_AVX512 CACHE) + check_cxx_symbol_exists(__AVX512F__ "" KOKKOS_COMPILER_HAS_AVX512) + unset(KOKKOS_COMPILER_HAS_AVX2 CACHE) + check_cxx_symbol_exists(__AVX2__ "" KOKKOS_COMPILER_HAS_AVX2) + unset(KOKKOS_COMPILER_HAS_ARM_NEON CACHE) + check_cxx_symbol_exists(__ARM_NEON "" KOKKOS_COMPILER_HAS_ARM_NEON) + unset(KOKKOS_COMPILER_HAS_AVX CACHE) + check_cxx_symbol_exists(__AVX__ "" KOKKOS_COMPILER_HAS_AVX) + set(CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - UNSET(CMAKE_REQUIRED_QUIET) - UNSET(CMAKE_REQUIRED_FLAGS) - ENDIF() + unset(CMAKE_REQUIRED_QUIET) + unset(CMAKE_REQUIRED_FLAGS) + endif() # Only define one of these macros for now # to be uniform with what we are doing for other architectures. - IF(KOKKOS_COMPILER_HAS_AVX512) - MESSAGE(STATUS "SIMD: AVX512 detected") - SET(KOKKOS_ARCH_AVX512XEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX2) - MESSAGE(STATUS "SIMD: AVX2 detected") - SET(KOKKOS_ARCH_AVX2 ON) - ELSEIF(KOKKOS_COMPILER_HAS_ARM_NEON) - MESSAGE(STATUS "SIMD: ARM_NEON detected") - SET(KOKKOS_ARCH_ARM_NEON ON) - ELSEIF(KOKKOS_COMPILER_HAS_AVX) - MESSAGE(STATUS "SIMD: AVX detected") - SET(KOKKOS_ARCH_AVX ON) - ENDIF() -ENDIF() + if(KOKKOS_COMPILER_HAS_AVX512) + message(STATUS "SIMD: AVX512 detected") + set(KOKKOS_ARCH_AVX512XEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX2) + message(STATUS "SIMD: AVX2 detected") + set(KOKKOS_ARCH_AVX2 ON) + elseif(KOKKOS_COMPILER_HAS_ARM_NEON) + message(STATUS "SIMD: ARM_NEON detected") + set(KOKKOS_ARCH_ARM_NEON ON) + elseif(KOKKOS_COMPILER_HAS_AVX) + message(STATUS "SIMD: AVX detected") + set(KOKKOS_ARCH_AVX ON) + endif() +endif() # FIXME_NVHPC nvc++ doesn't seem to support AVX512. -IF (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) - SET(KOKKOS_ARCH_AVX512XEON OFF) -ENDIF() +if(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL NVHPC) + set(KOKKOS_ARCH_AVX512XEON OFF) +endif() # FIXME_NVCC nvcc doesn't seem to support Arm Neon. -IF(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - UNSET(KOKKOS_ARCH_ARM_NEON) -ENDIF() +if(KOKKOS_ARCH_ARM_NEON AND KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + unset(KOKKOS_ARCH_ARM_NEON) +endif() -IF (NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - Clang -fcuda-rdc - NVIDIA --relocatable-device-code=true - ) - ENDIF() -ENDIF() +if(NOT KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(Clang -fcuda-rdc NVIDIA --relocatable-device-code=true) + endif() +endif() # Clang needs mcx16 option enabled for Windows atomic functions -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) - COMPILER_SPECIFIC_OPTIONS( - Clang -mcx16 - ) -ENDIF() +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND WIN32) + compiler_specific_options(Clang -mcx16) +endif() # MSVC ABI has many deprecation warnings, so ignore them -IF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") - COMPILER_SPECIFIC_DEFS( - Clang _CRT_SECURE_NO_WARNINGS - ) -ENDIF() - +if(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + compiler_specific_defs(Clang _CRT_SECURE_NO_WARNINGS) +endif() #Right now we cannot get the compiler ID when cross-compiling, so just check #that HIP is enabled -IF (KOKKOS_ENABLE_HIP) - IF (KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fgpu-rdc - ) - IF (NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT --hip-link - ) - ENDIF() - ELSE() - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fno-gpu-rdc - ) - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_HIP) + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + compiler_specific_flags(DEFAULT -fgpu-rdc) + if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + compiler_specific_link_options(DEFAULT --hip-link) + endif() + else() + compiler_specific_flags(DEFAULT -fno-gpu-rdc) + endif() +endif() -IF (KOKKOS_ENABLE_SYCL) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization - ) - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-unnamed-lambda - ) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + compiler_specific_flags(DEFAULT -fsycl -fno-sycl-id-queries-fit-in-int -fsycl-dead-args-optimization) + compiler_specific_options(DEFAULT -fsycl-unnamed-lambda) + if(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2024.1.0) + # Before oneAPI 2024.1.0 passing -fno-sycl didn't work properly + if(NOT KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + message(FATAL_ERROR "Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE=OFF requires oneAPI 2024.1.0 or later") + endif() + elseif(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT -fsycl-rdc) + else() + compiler_specific_options(DEFAULT -fno-sycl-rdc) + endif() +endif() # Check support for device_global variables # FIXME_SYCL If SYCL_EXT_ONEAPI_DEVICE_GLOBAL is defined, we can use device @@ -613,17 +734,18 @@ ENDIF() # implementation. Otherwise, the feature is not supported when building shared # libraries. Thus, we don't even check for support if shared libraries are # requested and SYCL_EXT_ONEAPI_DEVICE_GLOBAL is not defined. -IF(KOKKOS_ENABLE_SYCL) - STRING(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") - INCLUDE(CheckCXXSymbolExists) - CHECK_CXX_SYMBOL_EXISTS(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - IF (KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) +if(KOKKOS_ENABLE_SYCL) + string(REPLACE ";" " " CMAKE_REQUIRED_FLAGS "${KOKKOS_COMPILE_OPTIONS}") + include(CheckCXXSymbolExists) + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_DEVICE_GLOBAL "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + if(KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED ON) # Use the non-separable compilation implementation to support shared libraries as well. - COMPILER_SPECIFIC_FLAGS(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) - ELSEIF(NOT BUILD_SHARED_LIBS) - INCLUDE(CheckCXXSourceCompiles) - CHECK_CXX_SOURCE_COMPILES(" + compiler_specific_flags(DEFAULT -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + elseif(NOT BUILD_SHARED_LIBS AND KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " #include using namespace sycl::ext::oneapi::experimental; using namespace sycl; @@ -638,548 +760,617 @@ IF(KOKKOS_ENABLE_SYCL) int main(){ return 0; } " - KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED + ) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED) # Only the separable compilation implementation is supported. - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED + compiler_specific_flags(DEFAULT -fsycl-device-code-split=off -DDESUL_SYCL_DEVICE_GLOBAL_SUPPORTED) + endif() + endif() + + check_cxx_symbol_exists(SYCL_EXT_ONEAPI_GRAPH "sycl/sycl.hpp" KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_GRAPH) +endif() + +set(CUDA_ARCH_ALREADY_SPECIFIED "") +function(CHECK_CUDA_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(CUDA_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." ) - ENDIF() - ENDIF() -ENDIF() - -SET(CUDA_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_CUDA_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${CUDA_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_CUDA AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_SYCL AND NOT KOKKOS_ENABLE_OPENACC) - MESSAGE(WARNING "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_CUDA) - STRING(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) - SET(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) - IF(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - SET(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) - ELSE() - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() - ENDIF() - LIST(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) - SET(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) - LIST(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) - SET(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) -ENDFUNCTION() - + endif() + set(CUDA_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_CUDA + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_SYCL + AND NOT KOKKOS_ENABLE_OPENACC + ) + message( + WARNING + "Given CUDA arch ${ARCH}, but Kokkos_ENABLE_CUDA, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_CUDA) + string(REPLACE "sm_" "" CMAKE_ARCH ${FLAG}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH}) + set(KOKKOS_CUDA_ARCHITECTURES ${CMAKE_ARCH} PARENT_SCOPE) + endif() + set(KOKKOS_CUDA_ARCH_FLAG ${FLAG} PARENT_SCOPE) + if(KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + set(CMAKE_CUDA_ARCHITECTURES ${KOKKOS_CUDA_ARCHITECTURES} PARENT_SCOPE) + else() + global_append(KOKKOS_CUDA_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE OR KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + global_append(KOKKOS_LINK_OPTIONS "${CUDA_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() + endif() + list(APPEND KOKKOS_CUDA_ARCH_FLAGS ${FLAG}) + set(KOKKOS_CUDA_ARCH_FLAGS ${KOKKOS_CUDA_ARCH_FLAGS} PARENT_SCOPE) + list(APPEND KOKKOS_CUDA_ARCH_LIST ${ARCH}) + set(KOKKOS_CUDA_ARCH_LIST ${KOKKOS_CUDA_ARCH_LIST} PARENT_SCOPE) +endfunction() #These will define KOKKOS_CUDA_ARCH_FLAG #to the corresponding flag name if ON -CHECK_CUDA_ARCH(KEPLER30 sm_30) -CHECK_CUDA_ARCH(KEPLER32 sm_32) -CHECK_CUDA_ARCH(KEPLER35 sm_35) -CHECK_CUDA_ARCH(KEPLER37 sm_37) -CHECK_CUDA_ARCH(MAXWELL50 sm_50) -CHECK_CUDA_ARCH(MAXWELL52 sm_52) -CHECK_CUDA_ARCH(MAXWELL53 sm_53) -CHECK_CUDA_ARCH(PASCAL60 sm_60) -CHECK_CUDA_ARCH(PASCAL61 sm_61) -CHECK_CUDA_ARCH(VOLTA70 sm_70) -CHECK_CUDA_ARCH(VOLTA72 sm_72) -CHECK_CUDA_ARCH(TURING75 sm_75) -CHECK_CUDA_ARCH(AMPERE80 sm_80) -CHECK_CUDA_ARCH(AMPERE86 sm_86) -CHECK_CUDA_ARCH(ADA89 sm_89) -CHECK_CUDA_ARCH(HOPPER90 sm_90) +check_cuda_arch(KEPLER30 sm_30) +check_cuda_arch(KEPLER32 sm_32) +check_cuda_arch(KEPLER35 sm_35) +check_cuda_arch(KEPLER37 sm_37) +check_cuda_arch(MAXWELL50 sm_50) +check_cuda_arch(MAXWELL52 sm_52) +check_cuda_arch(MAXWELL53 sm_53) +check_cuda_arch(PASCAL60 sm_60) +check_cuda_arch(PASCAL61 sm_61) +check_cuda_arch(VOLTA70 sm_70) +check_cuda_arch(VOLTA72 sm_72) +check_cuda_arch(TURING75 sm_75) +check_cuda_arch(AMPERE80 sm_80) +check_cuda_arch(AMPERE86 sm_86) +check_cuda_arch(ADA89 sm_89) +check_cuda_arch(HOPPER90 sm_90) -SET(AMDGPU_ARCH_ALREADY_SPECIFIED "") -FUNCTION(CHECK_AMDGPU_ARCH ARCH FLAG) - IF(KOKKOS_ARCH_${ARCH}) - IF(AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again.") - ENDIF() - SET(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) - IF (NOT KOKKOS_ENABLE_HIP AND NOT KOKKOS_ENABLE_OPENMPTARGET AND NOT KOKKOS_ENABLE_OPENACC AND NOT KOKKOS_ENABLE_SYCL) - MESSAGE(WARNING "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored.") - UNSET(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) - ELSE() - IF(KOKKOS_ENABLE_HIP) - SET(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) - ENDIF() - IF(NOT KOKKOS_IMPL_AMDGPU_FLAGS) - SET(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() +set(AMDGPU_ARCH_ALREADY_SPECIFIED "") +function(CHECK_AMDGPU_ARCH ARCH FLAG) + if(KOKKOS_ARCH_${ARCH}) + if(AMDGPU_ARCH_ALREADY_SPECIFIED) + message( + FATAL_ERROR + "Multiple GPU architectures given! Already have ${AMDGPU_ARCH_ALREADY_SPECIFIED}, but trying to add ${ARCH}. If you are re-running CMake, try clearing the cache and running again." + ) + endif() + set(AMDGPU_ARCH_ALREADY_SPECIFIED ${ARCH} PARENT_SCOPE) + if(NOT KOKKOS_ENABLE_HIP + AND NOT KOKKOS_ENABLE_OPENMPTARGET + AND NOT KOKKOS_ENABLE_OPENACC + AND NOT KOKKOS_ENABLE_SYCL + ) + message( + WARNING + "Given AMD GPU architecture ${ARCH}, but Kokkos_ENABLE_HIP, Kokkos_ENABLE_SYCL, Kokkos_ENABLE_OPENACC, and Kokkos_ENABLE_OPENMPTARGET are OFF. Option will be ignored." + ) + unset(KOKKOS_ARCH_${ARCH} PARENT_SCOPE) + else() + if(KOKKOS_ENABLE_HIP) + set(KOKKOS_HIP_ARCHITECTURES ${FLAG} PARENT_SCOPE) + endif() + if(NOT KOKKOS_IMPL_AMDGPU_FLAGS) + set(KOKKOS_AMDGPU_ARCH_FLAG ${FLAG} PARENT_SCOPE) + global_append(KOKKOS_AMDGPU_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + global_append(KOKKOS_LINK_OPTIONS "${AMDGPU_ARCH_FLAG}=${FLAG}") + endif() + endif() + endif() +endfunction() #These will define KOKKOS_AMDGPU_ARCH_FLAG #to the corresponding flag name if ON -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + check_amdgpu_arch(${ARCH} ${FLAG}) +endforeach() -IF(KOKKOS_IMPL_AMDGPU_FLAGS) - IF (NOT AMDGPU_ARCH_ALREADY_SPECIFIED) - MESSAGE(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " - "Please explicitly set the GPU architecture.") - ENDIF() - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") -ENDIF() +if(KOKKOS_IMPL_AMDGPU_FLAGS) + if(NOT AMDGPU_ARCH_ALREADY_SPECIFIED) + message(FATAL_ERROR "When IMPL_AMDGPU_FLAGS is set the architecture autodectection is disabled. " + "Please explicitly set the GPU architecture." + ) + endif() + global_append(KOKKOS_AMDGPU_OPTIONS "${KOKKOS_IMPL_AMDGPU_FLAGS}") + global_append(KOKKOS_LINK_OPTIONS "${KOKKOS_IMPL_AMDGPU_LINK}") +endif() -MACRO(SET_AND_CHECK_AMD_ARCH ARCH FLAG) - KOKKOS_SET_OPTION(ARCH_${ARCH} ON) - CHECK_AMDGPU_ARCH(${ARCH} ${FLAG}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) -ENDMACRO() +macro(SET_AND_CHECK_AMD_ARCH ARCH FLAG) + kokkos_set_option(ARCH_${ARCH} ON) + check_amdgpu_arch(${ARCH} ${FLAG}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCH}) +endmacro() -MACRO(CHECK_MULTIPLE_INTEL_ARCH) - IF(KOKKOS_ARCH_INTEL_GPU) - MESSAGE(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") - ENDIF() - SET(KOKKOS_ARCH_INTEL_GPU ON) -ENDMACRO() +macro(CHECK_MULTIPLE_INTEL_ARCH) + if(KOKKOS_ARCH_INTEL_GPU) + message(FATAL_ERROR "Specifying multiple Intel GPU architectures is not allowed!") + endif() + set(KOKKOS_ARCH_INTEL_GPU ON) +endmacro() -IF(KOKKOS_ARCH_INTEL_GEN) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_DG1) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN9) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN11) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_GEN12LP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_XEHP) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() -IF(KOKKOS_ARCH_INTEL_PVC) - CHECK_MULTIPLE_INTEL_ARCH() -ENDIF() +if(KOKKOS_ARCH_INTEL_GEN) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_DG1) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN9) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN11) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_GEN12LP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_XEHP) + check_multiple_intel_arch() +endif() +if(KOKKOS_ARCH_INTEL_PVC) + check_multiple_intel_arch() +endif() -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - IF (CLANG_CUDA_ARCH) - IF(KOKKOS_CLANG_IS_CRAY) - COMPILER_SPECIFIC_FLAGS( - Cray -fopenmp +if(KOKKOS_ENABLE_OPENMP) + compiler_specific_link_options(CrayClang -fopenmp) +endif() + +if(KOKKOS_ENABLE_OPENMPTARGET) + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + if(CLANG_CUDA_ARCH) + if(KOKKOS_CLANG_IS_CRAY) + compiler_specific_flags(Cray -fopenmp) + else() + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) + compiler_specific_flags( + Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 NVHPC -gpu=${NVHPC_CUDA_ARCH} ) - ELSE() - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${CLANG_CUDA_ARCH}) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target -march=${CLANG_CUDA_ARCH} -fopenmp-targets=nvptx64 - NVHPC -gpu=${NVHPC_CUDA_ARCH} - ) - ENDIF() - ENDIF() - SET(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) - IF (CLANG_AMDGPU_ARCH) - COMPILER_SPECIFIC_FLAGS( + endif() + endif() + set(CLANG_AMDGPU_ARCH ${KOKKOS_AMDGPU_ARCH_FLAG}) + if(CLANG_AMDGPU_ARCH) + compiler_specific_flags( Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${CLANG_AMDGPU_ARCH} -fopenmp-targets=amdgcn-amd-amdhsa ) - ENDIF() - IF (KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__ - ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__ - ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7" - ) - ENDIF() - ENDIF() -ENDIF() + endif() + if(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(IntelLLVM -fopenmp-targets=spir64 -D__STRICT_ANSI__) + else() + compiler_specific_options(IntelLLVM -fopenmp-targets=spir64_gen -D__STRICT_ANSI__) + if(KOKKOS_ARCH_INTEL_GEN9) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + compiler_specific_link_options(IntelLLVM -fopenmp-targets=spir64_gen -Xopenmp-target-backend "-device 12.60.7") + endif() + endif() +endif() -IF (KOKKOS_ENABLE_OPENACC) - IF(KOKKOS_CUDA_ARCH_FLAG) - SET(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - STRING(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc -gpu=${NVHPC_CUDA_ARCH} - Clang -Xopenmp-target=nvptx64-nvidia-cuda -march=${CLANG_CUDA_ARCH} - -fopenmp-targets=nvptx64-nvidia-cuda - ) - ELSEIF(KOKKOS_AMDGPU_ARCH_FLAG) - COMPILER_SPECIFIC_FLAGS( - Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} - -fopenmp-targets=amdgcn-amd-amdhsa - ) - ELSE() - COMPILER_SPECIFIC_FLAGS( - NVHPC -acc - ) - ENDIF() -ENDIF() - -IF (KOKKOS_ENABLE_SYCL) - IF(CUDA_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} +if(KOKKOS_ENABLE_OPENACC) + if(KOKKOS_CUDA_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." ) - ELSE() - MESSAGE(SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(AMDGPU_ARCH_ALREADY_SPECIFIED) - IF(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) - COMPILER_SPECIFIC_FLAGS( + endif() + set(CLANG_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + string(REPLACE "sm_" "cc" NVHPC_CUDA_ARCH ${KOKKOS_CUDA_ARCH_FLAG}) + compiler_specific_flags( + NVHPC + -acc + -gpu=${NVHPC_CUDA_ARCH} + Clang + -Xopenmp-target=nvptx64-nvidia-cuda + -march=${CLANG_CUDA_ARCH} + -fopenmp-targets=nvptx64-nvidia-cuda + ) + if(DEFINED ENV{CUDA_PATH}) + compiler_specific_link_options(Clang -L$ENV{CUDA_PATH}/lib64) + endif() + compiler_specific_libs(Clang -lcudart NVHPC -cuda) + elseif(KOKKOS_AMDGPU_ARCH_FLAG) + if(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + message( + FATAL_ERROR + "If a GPU architecture is specified, Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option cannot be used. Disable the Kokkos_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE option." + ) + endif() + compiler_specific_flags( + Clang -Xopenmp-target=amdgcn-amd-amdhsa -march=${KOKKOS_AMDGPU_ARCH_FLAG} -fopenmp-targets=amdgcn-amd-amdhsa + ) + if(DEFINED ENV{ROCM_PATH}) + compiler_specific_flags(Clang -I$ENV{ROCM_PATH}/include) + compiler_specific_link_options(Clang -L$ENV{ROCM_PATH}/lib) + endif() + compiler_specific_libs(Clang -lamdhip64) + elseif(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + # Compile for kernel execution on the host. In that case, + # memory is shared between the OpenACC space and the host space. + compiler_specific_flags(NVHPC -acc=multicore) + else() + # Automatic fallback mode; try to offload any available GPU, and fall back + # to the host CPU if no available GPU is found. + compiler_specific_flags(NVHPC -acc=gpu,multicore) + message( + STATUS + "No OpenACC target device is specificed; the OpenACC backend will be executed in an automatic fallback mode." + ) + endif() +endif() + +if(KOKKOS_ENABLE_SYCL) + if(CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( + DEFAULT -fsycl-targets=nvptx64-nvidia-cuda -Xsycl-target-backend=nvptx64-nvidia-cuda + --cuda-gpu-arch=${KOKKOS_CUDA_ARCH_FLAG} + ) + else() + message( + SEND_ERROR "Setting a CUDA architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" + ) + endif() + elseif(AMDGPU_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ENABLE_UNSUPPORTED_ARCHS) + compiler_specific_flags( DEFAULT -fsycl-targets=amdgcn-amd-amdhsa -Xsycl-target-backend --offload-arch=${KOKKOS_AMDGPU_ARCH_FLAG} ) - ELSE() - MESSAGE(SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!") - ENDIF() - ELSEIF(KOKKOS_ARCH_INTEL_GEN) - COMPILER_SPECIFIC_FLAGS( - DEFAULT -fsycl-targets=spir64 - ) - ELSE() - COMPILER_SPECIFIC_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen - ) - IF(KOKKOS_ARCH_INTEL_GEN9) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen9" + else() + message( + SEND_ERROR "Setting a AMDGPU architecture for SYCL is only allowed with Kokkos_ENABLE_UNSUPPORTED_ARCHS=ON!" ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN11) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen11" - ) - ELSEIF(KOKKOS_ARCH_INTEL_GEN12LP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device gen12lp" - ) - ELSEIF(KOKKOS_ARCH_INTEL_DG1) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device dg1" - ) - ELSEIF(KOKKOS_ARCH_INTEL_XEHP) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.50.4" - ) - ELSEIF(KOKKOS_ARCH_INTEL_PVC) - COMPILER_SPECIFIC_LINK_OPTIONS( - DEFAULT -fsycl-targets=spir64_gen -Xsycl-target-backend "-device 12.60.7" - ) - ENDIF() - ENDIF() -ENDIF() + endif() + elseif(KOKKOS_ARCH_INTEL_GEN) + compiler_specific_flags(DEFAULT -fsycl-targets=spir64) + elseif(KOKKOS_ARCH_INTEL_GPU) + set(SYCL_TARGET_FLAG -fsycl-targets=spir64_gen) -IF(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) + if(KOKKOS_ARCH_INTEL_GEN9) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen9") + elseif(KOKKOS_ARCH_INTEL_GEN11) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen11") + elseif(KOKKOS_ARCH_INTEL_GEN12LP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device gen12lp") + elseif(KOKKOS_ARCH_INTEL_DG1) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device dg1") + elseif(KOKKOS_ARCH_INTEL_XEHP) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.50.4") + elseif(KOKKOS_ARCH_INTEL_PVC) + set(SYCL_TARGET_BACKEND_FLAG -Xsycl-target-backend "-device 12.60.7") + endif() + + if(Kokkos_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG}) + compiler_specific_link_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + else() + compiler_specific_options(DEFAULT ${SYCL_TARGET_FLAG} ${SYCL_TARGET_BACKEND_FLAG}) + endif() + endif() +endif() + +if(KOKKOS_ENABLE_CUDA AND NOT CUDA_ARCH_ALREADY_SPECIFIED) # Try to autodetect the CUDA Compute Capability by asking the device - SET(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) - FILE(REMOVE_RECURSE ${_BINARY_TEST_DIR}) - FILE(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) + set(_BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/CUDAComputeCapabilityWorkdir) + file(REMOVE_RECURSE ${_BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${_BINARY_TEST_DIR}) - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) # if user is using kokkos_compiler_launcher, above will fail. - IF(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) + if(NOT _COMPILE_RESULT OR NOT _RESULT EQUAL 0) # check to see if CUDA is not already enabled (may happen when Kokkos is subproject) - GET_PROPERTY(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) + get_property(_ENABLED_LANGUAGES GLOBAL PROPERTY ENABLED_LANGUAGES) # language has to be fully enabled, just checking for CMAKE_CUDA_COMPILER isn't enough - IF(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) + if(NOT "CUDA" IN_LIST _ENABLED_LANGUAGES) # make sure the user knows that we aren't using CUDA compiler for anything else - MESSAGE(STATUS "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture...") - INCLUDE(CheckLanguage) - CHECK_LANGUAGE(CUDA) - IF(CMAKE_CUDA_COMPILER) - ENABLE_LANGUAGE(CUDA) - ELSE() - MESSAGE(STATUS "CUDA language could not be enabled") - ENDIF() - ENDIF() + message( + STATUS + "CUDA auto-detection of architecture failed with ${CMAKE_CXX_COMPILER}. Enabling CUDA language ONLY to auto-detect architecture..." + ) + include(CheckLanguage) + check_language(CUDA) + if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + else() + message(STATUS "CUDA language could not be enabled") + endif() + endif() # if CUDA was enabled, this will be defined - IF(CMAKE_CUDA_COMPILER) + if(CMAKE_CUDA_COMPILER) # copy our test to .cu so cmake compiles as CUDA - CONFIGURE_FILE( + configure_file( ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COPYONLY + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COPYONLY ) # run test again - TRY_RUN( - _RESULT - _COMPILE_RESULT - ${_BINARY_TEST_DIR} - ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu - COMPILE_DEFINITIONS -DSM_ONLY - RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY) - ENDIF() - ENDIF() + try_run(_RESULT _COMPILE_RESULT ${_BINARY_TEST_DIR} + ${CMAKE_CURRENT_BINARY_DIR}/compile_tests/cuda_compute_capability.cu COMPILE_DEFINITIONS -DSM_ONLY + RUN_OUTPUT_VARIABLE _CUDA_COMPUTE_CAPABILITY + ) + endif() + endif() - LIST(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) - IF(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) - MESSAGE(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") - LIST(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) - KOKKOS_SET_OPTION(ARCH_${ARCHITECTURE} ON) - CHECK_CUDA_ARCH(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) - LIST(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) - ELSE() - MESSAGE(SEND_ERROR "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " - "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" - "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " - "If you are cross-compiling, you should try to do this on a compute node.") - ENDIF() -ENDIF() + list(FIND KOKKOS_CUDA_ARCH_FLAGS sm_${_CUDA_COMPUTE_CAPABILITY} FLAG_INDEX) + if(_COMPILE_RESULT AND _RESULT EQUAL 0 AND NOT FLAG_INDEX EQUAL -1) + message(STATUS "Detected CUDA Compute Capability ${_CUDA_COMPUTE_CAPABILITY}") + list(GET KOKKOS_CUDA_ARCH_LIST ${FLAG_INDEX} ARCHITECTURE) + kokkos_set_option(ARCH_${ARCHITECTURE} ON) + check_cuda_arch(${ARCHITECTURE} sm_${_CUDA_COMPUTE_CAPABILITY}) + list(APPEND KOKKOS_ENABLED_ARCH_LIST ${ARCHITECTURE}) + else() + message( + SEND_ERROR + "CUDA enabled but no NVIDIA GPU architecture currently enabled and auto-detection failed. " + "Please give one -DKokkos_ARCH_{..}=ON' to enable an NVIDIA GPU architecture.\n" + "You can yourself try to compile ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/cuda_compute_capability.cc and run the executable. " + "If you are cross-compiling, you should try to do this on a compute node." + ) + endif() +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_KEPLER30 OR KOKKOS_ARCH_KEPLER32 OR KOKKOS_ARCH_KEPLER35 OR KOKKOS_ARCH_KEPLER37) - SET(KOKKOS_ARCH_KEPLER ON) -ENDIF() +if(KOKKOS_ARCH_KEPLER30 + OR KOKKOS_ARCH_KEPLER32 + OR KOKKOS_ARCH_KEPLER35 + OR KOKKOS_ARCH_KEPLER37 +) + set(KOKKOS_ARCH_KEPLER ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) - SET(KOKKOS_ARCH_MAXWELL ON) -ENDIF() +if(KOKKOS_ARCH_MAXWELL50 OR KOKKOS_ARCH_MAXWELL52 OR KOKKOS_ARCH_MAXWELL53) + set(KOKKOS_ARCH_MAXWELL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) - SET(KOKKOS_ARCH_PASCAL ON) -ENDIF() +if(KOKKOS_ARCH_PASCAL60 OR KOKKOS_ARCH_PASCAL61) + set(KOKKOS_ARCH_PASCAL ON) +endif() #Regardless of version, make sure we define the general architecture name -IF (KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) - SET(KOKKOS_ARCH_VOLTA ON) -ENDIF() +if(KOKKOS_ARCH_VOLTA70 OR KOKKOS_ARCH_VOLTA72) + set(KOKKOS_ARCH_VOLTA ON) +endif() -IF (KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) - SET(KOKKOS_ARCH_AMPERE ON) -ENDIF() +if(KOKKOS_ARCH_AMPERE80 OR KOKKOS_ARCH_AMPERE86) + set(KOKKOS_ARCH_AMPERE ON) +endif() -IF (KOKKOS_ARCH_HOPPER90) - SET(KOKKOS_ARCH_HOPPER ON) -ENDIF() +if(KOKKOS_ARCH_HOPPER90) + set(KOKKOS_ARCH_HOPPER ON) +endif() + +function(CHECK_AMD_APU ARCH) + set(BINARY_TEST_DIR ${CMAKE_CURRENT_BINARY_DIR}/cmake/compile_tests/AmdApuWorkdir) + file(REMOVE_RECURSE ${BINARY_TEST_DIR}) + file(MAKE_DIRECTORY ${BINARY_TEST_DIR}) + + try_run(RESULT COMPILE_RESULT ${BINARY_TEST_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/cmake/compile_tests/amd_apu.cc + RUN_OUTPUT_VARIABLE AMD_APU + ) + + if(NOT COMPILE_RESULT OR NOT RESULT EQUAL 0) + message(SEND_ERROR "Autodetection of AMD APU failed." + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + + if(AMD_APU) + set(${ARCH} AMD_GFX942_APU PARENT_SCOPE) + endif() +endfunction() #HIP detection of gpu arch -IF(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) - FIND_PROGRAM(ROCM_ENUMERATOR rocm_agent_enumerator) - IF(NOT ROCM_ENUMERATOR) - MESSAGE(FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " - "rocm_agent_enumerator could not be found. " - "Please specify an arch manually via -DKokkos_ARCH_{..}=ON") - ELSE() - EXECUTE_PROCESS(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) - STRING(LENGTH "${GPU_ARCHS}" len_str) +if(KOKKOS_ENABLE_HIP AND NOT AMDGPU_ARCH_ALREADY_SPECIFIED AND NOT KOKKOS_IMPL_AMDGPU_FLAGS) + find_program(ROCM_ENUMERATOR rocm_agent_enumerator) + if(NOT ROCM_ENUMERATOR) + message( + FATAL_ERROR "Autodetection of AMD GPU architecture not possible as " "rocm_agent_enumerator could not be found. " + "Please specify an arch manually via -DKokkos_ARCH_{..}=ON" + ) + else() + execute_process(COMMAND ${ROCM_ENUMERATOR} OUTPUT_VARIABLE GPU_ARCHS) + string(LENGTH "${GPU_ARCHS}" len_str) # enumerator always output gfx000 as the first line - IF(${len_str} LESS 8) - MESSAGE(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - # check for known gpu archs, otherwise error out - ELSE() - SET(AMD_ARCH_DETECTED "") - FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - LIST(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) - LIST(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) - STRING(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) - IF("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") - SET_AND_CHECK_AMD_ARCH(${ARCH} ${FLAG}) - SET(AMD_ARCH_DETECTED ${ARCH}) - BREAK() - ENDIF() - ENDFOREACH() - IF("${AMD_ARCH_DETECTED}" STREQUAL "") - MESSAGE(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " - "is supported. " - "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'.") - ENDIF() - ENDIF() - ENDIF() -ENDIF() + if(${len_str} LESS 8) + message(SEND_ERROR "HIP enabled but no AMD GPU architecture could be automatically detected. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + # check for known gpu archs, otherwise error out + else() + set(AMD_ARCH_DETECTED "") + foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + string(REGEX MATCH "(${FLAG})" DETECTED_GPU_ARCH ${GPU_ARCHS}) + if("${DETECTED_GPU_ARCH}" STREQUAL "${FLAG}") + # If we detected gfx942, we need to discriminate between APU and discrete GPU + if(FLAG STREQUAL "gfx942") + check_amd_apu(ARCH) + endif() + set_and_check_amd_arch(${ARCH} ${FLAG}) + set(AMD_ARCH_DETECTED ${ARCH}) + break() + endif() + endforeach() + if("${AMD_ARCH_DETECTED}" STREQUAL "") + message(FATAL_ERROR "HIP enabled but no automatically detected AMD GPU architecture " "is supported. " + "Please manually specify one AMD GPU architecture via -DKokkos_ARCH_{..}=ON'." + ) + endif() + endif() + endif() +endif() -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - STRING(REGEX MATCH "90A" IS_90A ${ARCH}) - IF(IS_90A) - SET(KOKKOS_ARCH_AMD_GFX90A ON) - SET(KOKKOS_ARCH_VEGA90A ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "908" IS_908 ${ARCH}) - IF(IS_908) - SET(KOKKOS_ARCH_AMD_GFX908 ON) - SET(KOKKOS_ARCH_VEGA908 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "906" IS_906 ${ARCH}) - IF(IS_906) - SET(KOKKOS_ARCH_AMD_GFX906 ON) - SET(KOKKOS_ARCH_VEGA906 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1100" IS_1100 ${ARCH}) - IF(IS_1100) - SET(KOKKOS_ARCH_AMD_GFX1100 ON) - SET(KOKKOS_ARCH_NAVI1100 ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "1030" IS_1030 ${ARCH}) - IF(IS_1030) - SET(KOKKOS_ARCH_AMD_GFX1030 ON) - SET(KOKKOS_ARCH_NAVI1030 ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + string(REGEX MATCH "90A" IS_90A ${ARCH}) + if(IS_90A) + set(KOKKOS_ARCH_AMD_GFX90A ON) + set(KOKKOS_ARCH_VEGA90A ON) + break() + endif() + string(REGEX MATCH "908" IS_908 ${ARCH}) + if(IS_908) + set(KOKKOS_ARCH_AMD_GFX908 ON) + set(KOKKOS_ARCH_VEGA908 ON) + break() + endif() + string(REGEX MATCH "906" IS_906 ${ARCH}) + if(IS_906) + set(KOKKOS_ARCH_AMD_GFX906 ON) + set(KOKKOS_ARCH_VEGA906 ON) + break() + endif() + string(REGEX MATCH "1100" IS_1100 ${ARCH}) + if(IS_1100) + set(KOKKOS_ARCH_AMD_GFX1100 ON) + set(KOKKOS_ARCH_NAVI1100 ON) + break() + endif() + string(REGEX MATCH "1030" IS_1030 ${ARCH}) + if(IS_1030) + set(KOKKOS_ARCH_AMD_GFX1030 ON) + set(KOKKOS_ARCH_NAVI1030 ON) + break() + endif() + endif() +endforeach() #Regardless of version, make sure we define the general architecture name -FOREACH(ARCH IN LISTS SUPPORTED_AMD_ARCHS) - IF (KOKKOS_ARCH_${ARCH}) - SET(KOKKOS_ARCH_AMD_GPU ON) - STRING(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) - IF(IS_VEGA) - SET(KOKKOS_ARCH_VEGA ON) - BREAK() - ENDIF() - STRING(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) - IF(IS_NAVI) - SET(KOKKOS_ARCH_NAVI ON) - BREAK() - ENDIF() - ENDIF() -ENDFOREACH() +foreach(ARCH IN LISTS SUPPORTED_AMD_ARCHS) + if(KOKKOS_ARCH_${ARCH}) + list(FIND SUPPORTED_AMD_ARCHS ${ARCH} LIST_INDEX) + list(GET CORRESPONDING_AMD_FLAGS ${LIST_INDEX} FLAG) + set(KOKKOS_ARCH_AMD_GPU "${FLAG}") + string(REGEX MATCH "(VEGA)" IS_VEGA ${ARCH}) + if(IS_VEGA) + set(KOKKOS_ARCH_VEGA ON) + break() + endif() + string(REGEX MATCH "(NAVI)" IS_NAVI ${ARCH}) + if(IS_NAVI) + set(KOKKOS_ARCH_NAVI ON) + break() + endif() + endif() +endforeach() #CMake verbose is kind of pointless #Let's just always print things -MESSAGE(STATUS "Built-in Execution Spaces:") +message(STATUS "Built-in Execution Spaces:") -FOREACH (_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_DEVICE_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple device parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_DEVICE_PARALLEL} is already enabled. " - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "Cuda") - IF(KOKKOS_ENABLE_CUDA_UVM) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead") - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}UVMSpace") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") - ENDIF() - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - ENDIF() - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSEIF(${_BACKEND} STREQUAL "HIP") - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") - ELSE() - SET(_DEFAULT_DEVICE_MEMSPACE "Kokkos::Experimental::${_BACKEND}Space") - SET(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() -IF(NOT _DEVICE_PARALLEL) - SET(_DEVICE_PARALLEL "NoTypeDefined") - SET(_DEFAULT_DEVICE_MEMSPACE "NoTypeDefined") -ENDIF() -MESSAGE(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") +foreach(_BACKEND Cuda OpenMPTarget HIP SYCL OpenACC) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_DEVICE_PARALLEL) + message( + FATAL_ERROR + "Multiple device parallel execution spaces are not allowed! " + "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_DEVICE_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." + ) + endif() + if(${_BACKEND} STREQUAL "Cuda") + if(KOKKOS_ENABLE_CUDA_UVM) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_UVM is deprecated - use the portable Kokkos::SharedSpace as an explicit memory space in your code instead" + ) + if(NOT KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(FATAL_ERROR "Kokkos_ENABLE_DEPRECATED_CODE_4 must be set to use Kokkos_ENABLE_CUDA_UVM") + endif() + endif() + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + elseif(${_BACKEND} STREQUAL "HIP" OR ${_BACKEND} STREQUAL "SYCL") + set(_DEVICE_PARALLEL "Kokkos::${_BACKEND}") + else() + set(_DEVICE_PARALLEL "Kokkos::Experimental::${_BACKEND}") + endif() + endif() +endforeach() +if(NOT _DEVICE_PARALLEL) + set(_DEVICE_PARALLEL "NoTypeDefined") +endif() +message(STATUS " Device Parallel: ${_DEVICE_PARALLEL}") -FOREACH (_BACKEND OpenMP Threads HPX) - STRING(TOUPPER ${_BACKEND} UC_BACKEND) - IF(KOKKOS_ENABLE_${UC_BACKEND}) - IF(_HOST_PARALLEL) - MESSAGE(FATAL_ERROR "Multiple host parallel execution spaces are not allowed! " - "Trying to enable execution space ${_BACKEND}, " - "but execution space ${_HOST_PARALLEL} is already enabled. " - "Remove the CMakeCache.txt file and re-configure.") - ENDIF() - IF (${_BACKEND} STREQUAL "HPX") - SET(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") - ELSE() - SET(_HOST_PARALLEL "Kokkos::${_BACKEND}") - ENDIF() - ENDIF() -ENDFOREACH() +foreach(_BACKEND OpenMP Threads HPX) + string(TOUPPER ${_BACKEND} UC_BACKEND) + if(KOKKOS_ENABLE_${UC_BACKEND}) + if(_HOST_PARALLEL) + message( + FATAL_ERROR + "Multiple host parallel execution spaces are not allowed! " "Trying to enable execution space ${_BACKEND}, " + "but execution space ${_HOST_PARALLEL} is already enabled. " + "Remove the CMakeCache.txt file and re-configure." + ) + endif() + if(${_BACKEND} STREQUAL "HPX") + set(_HOST_PARALLEL "Kokkos::Experimental::${_BACKEND}") + else() + set(_HOST_PARALLEL "Kokkos::${_BACKEND}") + endif() + endif() +endforeach() -IF(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "At least one host execution space must be enabled, " - "but no host parallel execution space was requested " - "and Kokkos_ENABLE_SERIAL=OFF.") -ENDIF() +if(NOT _HOST_PARALLEL AND NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "At least one host execution space must be enabled, " + "but no host parallel execution space was requested " "and Kokkos_ENABLE_SERIAL=OFF." + ) +endif() -IF(_HOST_PARALLEL) -MESSAGE(STATUS " Host Parallel: ${_HOST_PARALLEL}") -ELSE() - SET(_HOST_PARALLEL "NoTypeDefined") - MESSAGE(STATUS " Host Parallel: NoTypeDefined") -ENDIF() +if(_HOST_PARALLEL) + message(STATUS " Host Parallel: ${_HOST_PARALLEL}") +else() + set(_HOST_PARALLEL "NoTypeDefined") + message(STATUS " Host Parallel: NoTypeDefined") +endif() -IF(KOKKOS_ENABLE_SERIAL) - MESSAGE(STATUS " Host Serial: SERIAL") -ELSE() - MESSAGE(STATUS " Host Serial: NONE") -ENDIF() +if(KOKKOS_ENABLE_SERIAL) + message(STATUS " Host Serial: SERIAL") +else() + message(STATUS " Host Serial: NONE") +endif() -MESSAGE(STATUS "") -MESSAGE(STATUS "Architectures:") -FOREACH(Arch ${KOKKOS_ENABLED_ARCH_LIST}) - MESSAGE(STATUS " ${Arch}") -ENDFOREACH() +message(STATUS "") +message(STATUS "Architectures:") +foreach(Arch ${KOKKOS_ENABLED_ARCH_LIST}) + message(STATUS " ${Arch}") +endforeach() - -IF(KOKKOS_ENABLE_ATOMICS_BYPASS) - IF(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") - MESSAGE(FATAL_ERROR "Not allowed to disable atomics (via -DKokkos_ENABLE_AROMICS_BYPASS=ON) if neither a host parallel nor a device backend is enabled!") - ENDIF() - IF(NOT KOKKOS_ENABLE_SERIAL) - MESSAGE(FATAL_ERROR "Implementation bug") # safeguard - ENDIF() - MESSAGE(STATUS "Atomics: **DISABLED**") -ENDIF() +if(KOKKOS_ENABLE_ATOMICS_BYPASS) + if(NOT _HOST_PARALLEL STREQUAL "NoTypeDefined" OR NOT _DEVICE_PARALLEL STREQUAL "NoTypeDefined") + message( + FATAL_ERROR + "Disabling atomics (via -DKokkos_ENABLE_ATOMICS_BYPASS=ON) is not allowed if a host parallel or a device backend is enabled!" + ) + endif() + if(NOT KOKKOS_ENABLE_SERIAL) + message(FATAL_ERROR "Implementation bug") # safeguard + endif() + message(STATUS "Atomics: **DISABLED**") +endif() diff --git a/lib/kokkos/cmake/kokkos_check_env.cmake b/lib/kokkos/cmake/kokkos_check_env.cmake index a455a403b9..f1a309ff85 100644 --- a/lib/kokkos/cmake/kokkos_check_env.cmake +++ b/lib/kokkos/cmake/kokkos_check_env.cmake @@ -1,12 +1,15 @@ -SET(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) -IF (CRAYPE_VERSION) - SET(KOKKOS_IS_CRAYPE TRUE) - SET(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) - IF (CRAYPE_LINK_TYPE) - IF (NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") - MESSAGE(WARNING "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() - ELSE() - MESSAGE(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") - ENDIF() -ENDIF() +set(CRAYPE_VERSION $ENV{CRAYPE_VERSION}) +if(CRAYPE_VERSION) + set(KOKKOS_IS_CRAYPE TRUE) + set(CRAYPE_LINK_TYPE $ENV{CRAYPE_LINK_TYPE}) + if(CRAYPE_LINK_TYPE) + if(NOT CRAYPE_LINK_TYPE STREQUAL "dynamic") + message( + WARNING + "CRAYPE_LINK_TYPE is set to ${CRAYPE_LINK_TYPE}. Linking is likely to fail unless this is set to 'dynamic'" + ) + endif() + else() + message(WARNING "CRAYPE_LINK_TYPE is not set. Linking is likely to fail unless this is set to 'dynamic'") + endif() +endif() diff --git a/lib/kokkos/cmake/kokkos_compiler_id.cmake b/lib/kokkos/cmake/kokkos_compiler_id.cmake index e8bfadb64e..010ed33ede 100644 --- a/lib/kokkos/cmake/kokkos_compiler_id.cmake +++ b/lib/kokkos/cmake/kokkos_compiler_id.cmake @@ -1,262 +1,273 @@ -KOKKOS_CFG_DEPENDS(COMPILER_ID NONE) +kokkos_cfg_depends(COMPILER_ID NONE) -SET(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) -SET(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) -SET(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) +set(KOKKOS_CXX_COMPILER ${CMAKE_CXX_COMPILER}) +set(KOKKOS_CXX_COMPILER_ID ${CMAKE_CXX_COMPILER_ID}) +set(KOKKOS_CXX_COMPILER_VERSION ${CMAKE_CXX_COMPILER_VERSION}) -MACRO(kokkos_internal_have_compiler_nvcc) +macro(kokkos_internal_have_compiler_nvcc) # Check if the compiler is nvcc (which really means nvcc_wrapper). - EXECUTE_PROCESS(COMMAND ${ARGN} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) - STRING(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") - IF(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) - SET(INTERNAL_HAVE_COMPILER_NVCC true) - ELSE() - SET(INTERNAL_HAVE_COMPILER_NVCC false) - ENDIF() -ENDMACRO() + execute_process(COMMAND ${ARGN} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "nvcc" INTERNAL_COMPILER_VERSION_CONTAINS_NVCC) + string(REGEX REPLACE "^ +" "" INTERNAL_HAVE_COMPILER_NVCC "${INTERNAL_HAVE_COMPILER_NVCC}") + if(${INTERNAL_COMPILER_VERSION_CONTAINS_NVCC} GREATER -1) + set(INTERNAL_HAVE_COMPILER_NVCC true) + else() + set(INTERNAL_HAVE_COMPILER_NVCC false) + endif() +endmacro() -IF(Kokkos_ENABLE_CUDA) +if(Kokkos_ENABLE_CUDA) # kokkos_enable_options is not yet called so use lower case here - IF(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + if(Kokkos_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) kokkos_internal_have_compiler_nvcc(${CMAKE_CUDA_COMPILER}) - ELSE() + else() # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) # Check if compiler was set to nvcc_wrapper kokkos_internal_have_compiler_nvcc(${CMAKE_CXX_COMPILER}) # If launcher was found and nvcc_wrapper was not specified as # compiler and `CMAKE_CXX_COMPILIER_LAUNCHER` is not set, set to use launcher. # Will ensure CMAKE_CXX_COMPILER is replaced by nvcc_wrapper - IF(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(CMAKE_CXX_COMPILER_LAUNCHER) - MESSAGE(FATAL_ERROR "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!") - ENDIF() + if(Kokkos_COMPILE_LAUNCHER AND NOT INTERNAL_HAVE_COMPILER_NVCC AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(CMAKE_CXX_COMPILER_LAUNCHER) + message( + FATAL_ERROR + "Cannot use CMAKE_CXX_COMPILER_LAUNCHER if the CMAKE_CXX_COMPILER is not able to compile CUDA code, i.e. nvcc_wrapper or clang++!" + ) + endif() # the first argument to launcher is always the C++ compiler defined by cmake # if the second argument matches the C++ compiler, it forwards the rest of the # args to nvcc_wrapper kokkos_internal_have_compiler_nvcc( - ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} -DKOKKOS_DEPENDENCE) - SET(INTERNAL_USE_COMPILER_LAUNCHER true) - ENDIF() - ENDIF() -ENDIF() + ${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER} ${CMAKE_CXX_COMPILER} + -DKOKKOS_DEPENDENCE + ) + set(INTERNAL_USE_COMPILER_LAUNCHER true) + endif() + endif() +endif() -IF(INTERNAL_HAVE_COMPILER_NVCC) +if(INTERNAL_HAVE_COMPILER_NVCC) # Save the host compiler id before overwriting it. - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) # SET the compiler id to nvcc. We use the value used by CMake 3.8. - SET(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) + set(KOKKOS_CXX_COMPILER_ID NVIDIA CACHE STRING INTERNAL FORCE) - STRING(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - STRING(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") - IF(INTERNAL_USE_COMPILER_LAUNCHER) - MESSAGE(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") + string(REGEX MATCH "V[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + string(SUBSTRING ${TEMP_CXX_COMPILER_VERSION} 1 -1 TEMP_CXX_COMPILER_VERSION) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") + if(INTERNAL_USE_COMPILER_LAUNCHER) + message(STATUS "kokkos_launch_compiler (${Kokkos_COMPILE_LAUNCHER}) is enabled...") kokkos_compilation(GLOBAL) - ENDIF() -ENDIF() + endif() +endif() -IF(Kokkos_ENABLE_HIP) +if(Kokkos_ENABLE_HIP) # get HIP version - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_COMPILER_VERSION OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION} ) + string(REPLACE "\n" " - " INTERNAL_COMPILER_VERSION_ONE_LINE ${INTERNAL_COMPILER_VERSION}) - STRING(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) - IF(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) - SET(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) - ENDIF() + string(FIND ${INTERNAL_COMPILER_VERSION_ONE_LINE} "HIP version" INTERNAL_COMPILER_VERSION_CONTAINS_HIP) + if(INTERNAL_COMPILER_VERSION_CONTAINS_HIP GREATER -1) + set(KOKKOS_CXX_COMPILER_ID HIPCC CACHE STRING INTERNAL FORCE) + endif() - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - MESSAGE(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_COMPILER_VERSION_ONE_LINE}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + message(STATUS "Compiler Version: ${KOKKOS_CXX_COMPILER_VERSION}") +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) # The Cray compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c Cray - OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_CRAY TRUE) - ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c Cray + OUTPUT_VARIABLE INTERNAL_HAVE_CRAY_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_CRAY_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_CRAY TRUE) + set(KOKKOS_CXX_COMPILER_ID CrayClang) + endif() # The clang based Intel compiler reports as Clang to most versions of CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - COMMAND grep -c "DPC++\\|icpx" - OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER - OUTPUT_STRIP_TRAILING_WHITESPACE) - IF (INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang - SET(KOKKOS_CLANG_IS_INTEL TRUE) - SET(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - ENDIF() -ENDIF() + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version + COMMAND grep -c "DPC++\\|icpx" + OUTPUT_VARIABLE INTERNAL_HAVE_INTEL_COMPILER + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + if(INTERNAL_HAVE_INTEL_COMPILER) #not actually Clang + set(KOKKOS_CLANG_IS_INTEL TRUE) + set(KOKKOS_CXX_COMPILER_ID IntelLLVM CACHE STRING INTERNAL FORCE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" KOKKOS_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray OR KOKKOS_CLANG_IS_CRAY) # SET Cray's compiler version. - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - IF (KOKKOS_CLANG_IS_CRAY) - SET(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) - ELSE() - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) - ENDIF() -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + if(KOKKOS_CLANG_IS_CRAY) + set(KOKKOS_CLANG_CRAY_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION}) + else() + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) + endif() +endif() -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) +if(KOKKOS_CXX_COMPILER_ID STREQUAL Fujitsu) # SET Fujitsus compiler version which is not detected by CMake - EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version - OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION - OUTPUT_STRIP_TRAILING_WHITESPACE) + execute_process( + COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE INTERNAL_CXX_COMPILER_VERSION + OUTPUT_STRIP_TRAILING_WHITESPACE + ) - STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" - TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) - SET(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) -ENDIF() + string(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" TEMP_CXX_COMPILER_VERSION ${INTERNAL_CXX_COMPILER_VERSION}) + set(KOKKOS_CXX_COMPILER_VERSION ${TEMP_CXX_COMPILER_VERSION} CACHE STRING INTERNAL FORCE) +endif() # Enforce the minimum compilers supported by Kokkos. -IF(NOT CMAKE_CXX_STANDARD) - SET(CMAKE_CXX_STANDARD 17) -ENDIF() -IF(CMAKE_CXX_STANDARD EQUAL 17) - SET(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 8.2.0) - SET(KOKKOS_INTEL_MINIMUM 19.0.5) - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 11.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.29) -ELSE() - SET(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) - SET(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) - SET(KOKKOS_GCC_MINIMUM 10.1.0) - SET(KOKKOS_INTEL_MINIMUM "not supported") - SET(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) - SET(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) - SET(KOKKOS_NVCC_MINIMUM 12.0.0) - SET(KOKKOS_HIPCC_MINIMUM 5.2.0) - SET(KOKKOS_NVHPC_MINIMUM 22.3) - SET(KOKKOS_MSVC_MINIMUM 19.30) -ENDIF() +if(NOT CMAKE_CXX_STANDARD) + set(CMAKE_CXX_STANDARD 17) +endif() +if(CMAKE_CXX_STANDARD EQUAL 17) + set(KOKKOS_CLANG_CPU_MINIMUM 8.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 10.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 8.2.0) + set(KOKKOS_INTEL_MINIMUM 19.0.5) + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2021.1.1) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 11.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.29) +else() + set(KOKKOS_CLANG_CPU_MINIMUM 14.0.0) + set(KOKKOS_CLANG_CUDA_MINIMUM 14.0.0) + set(KOKKOS_CLANG_OPENMPTARGET_MINIMUM 15.0.0) + set(KOKKOS_GCC_MINIMUM 10.1.0) + set(KOKKOS_INTEL_MINIMUM "not supported") + set(KOKKOS_INTEL_LLVM_CPU_MINIMUM 2022.0.0) + set(KOKKOS_INTEL_LLVM_SYCL_MINIMUM 2023.0.0) + set(KOKKOS_NVCC_MINIMUM 12.0.0) + set(KOKKOS_HIPCC_MINIMUM 5.2.0) + set(KOKKOS_NVHPC_MINIMUM 22.3) + set(KOKKOS_MSVC_MINIMUM 19.30) +endif() -SET(KOKKOS_MESSAGE_TEXT "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") -SET(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") +set(KOKKOS_MESSAGE_TEXT + "Compiler not supported by Kokkos for C++${CMAKE_CXX_STANDARD}. Required minimum compiler versions:" +) +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CPU) ${KOKKOS_CLANG_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(CUDA) ${KOKKOS_CLANG_CUDA_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Clang(OpenMPTarget) ${KOKKOS_CLANG_OPENMPTARGET_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n GCC ${KOKKOS_GCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n Intel ${KOKKOS_INTEL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(CPU) ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n IntelLLVM(SYCL) ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVCC ${KOKKOS_NVCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n HIPCC ${KOKKOS_HIPCC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n NVHPC/PGI ${KOKKOS_NVHPC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n MSVC ${KOKKOS_MSVC_MINIMUM}") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\n XL/XLClang not supported") +set(KOKKOS_MESSAGE_TEXT "${KOKKOS_MESSAGE_TEXT}\nCompiler: ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION}\n") -IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - IF((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() - SET(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND NOT Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_CUDA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_CLANG_CUDA_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL GNU) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_GCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + if((NOT CMAKE_CXX_STANDARD EQUAL 17) OR (KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_MINIMUM})) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND NOT Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_CPU_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND Kokkos_ENABLE_SYCL) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_INTEL_LLVM_SYCL_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() + set(CMAKE_CXX_EXTENSIONS OFF CACHE BOOL "Kokkos turns off CXX extensions" FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_HIPCC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL PGI OR KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_NVHPC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() # Treat PGI internally as NVHPC to simplify handling both compilers. # Before CMake 3.20 NVHPC was identified as PGI, nvc++ is # backward-compatible to pgc++. - SET(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") -ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) - MESSAGE(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") - ENDIF() -ENDIF() + set(KOKKOS_CXX_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS ${KOKKOS_MSVC_MINIMUM}) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL XL OR KOKKOS_CXX_COMPILER_ID STREQUAL XLClang) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") +elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Clang AND Kokkos_ENABLE_OPENMPTARGET) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS KOKKOS_CLANG_OPENMPTARGET_MINIMUM) + message(FATAL_ERROR "${KOKKOS_MESSAGE_TEXT}") + endif() +endif() -IF(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) - SET(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) -ELSEIF(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) - SET(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) -ENDIF() +if(NOT DEFINED KOKKOS_CXX_HOST_COMPILER_ID) + set(KOKKOS_CXX_HOST_COMPILER_ID ${KOKKOS_CXX_COMPILER_ID}) +elseif(KOKKOS_CXX_HOST_COMPILER_ID STREQUAL PGI) + set(KOKKOS_CXX_HOST_COMPILER_ID NVHPC CACHE STRING INTERNAL FORCE) +endif() -STRING(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) -LIST(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) -LIST(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) -LIST(LENGTH VERSION_LIST LIST_LENGTH) +string(REPLACE "." ";" VERSION_LIST ${KOKKOS_CXX_COMPILER_VERSION}) +list(GET VERSION_LIST 0 KOKKOS_COMPILER_VERSION_MAJOR) +list(GET VERSION_LIST 1 KOKKOS_COMPILER_VERSION_MINOR) +list(LENGTH VERSION_LIST LIST_LENGTH) # On Android, the compiler doesn't have a patch version, just a major/minor -IF(LIST_LENGTH GREATER 2) - LIST(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) -ELSE() - SET(KOKKOS_COMPILER_VERSION_PATCH 0) -ENDIF() - +if(LIST_LENGTH GREATER 2) + list(GET VERSION_LIST 2 KOKKOS_COMPILER_VERSION_PATCH) +else() + set(KOKKOS_COMPILER_VERSION_PATCH 0) +endif() diff --git a/lib/kokkos/cmake/kokkos_configure_trilinos.cmake b/lib/kokkos/cmake/kokkos_configure_trilinos.cmake new file mode 100644 index 0000000000..5aeef61e7b --- /dev/null +++ b/lib/kokkos/cmake/kokkos_configure_trilinos.cmake @@ -0,0 +1,38 @@ +if(CMAKE_PROJECT_NAME STREQUAL "Trilinos") + set(Kokkos_ENABLE_SERIAL ON CACHE BOOL "Whether to build Serial backend" FORCE) + + if(NOT ${Trilinos_ENABLE_OpenMP} STREQUAL "") + set(Kokkos_ENABLE_OPENMP ${Trilinos_ENABLE_OpenMP} CACHE BOOL "Whether to build OpenMP backend" FORCE) + else() + set(Kokkos_ENABLE_OPENMP OFF CACHE BOOL "Whether to build OpenMP backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_CUDA} STREQUAL "") + set(Kokkos_ENABLE_CUDA ${TPL_ENABLE_CUDA} CACHE BOOL "Whether to build CUDA backend" FORCE) + else() + set(Kokkos_ENABLE_CUDA OFF CACHE BOOL "Whether to build CUDA backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_HPX} STREQUAL "") + set(Kokkos_ENABLE_HPX ${TPL_ENABLE_HPX} CACHE BOOL "Whether to build HPX backend" FORCE) + else() + set(Kokkos_ENABLE_HPX OFF CACHE BOOL "Whether to build HPX backend" FORCE) + endif() + + if(NOT ${TPL_ENABLE_quadmath} STREQUAL "") + set(Kokkos_ENABLE_LIBQUADMATH ${TPL_ENABLE_quadmath} CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + else() + set(Kokkos_ENABLE_LIBQUADMATH OFF CACHE BOOL "Whether to enable the LIBQUADMATH library" FORCE) + endif() + + if(NOT ${TPL_ENABLE_DLlib} STREQUAL "") + set(Kokkos_ENABLE_LIBDL ${TPL_ENABLE_DLlib} CACHE BOOL "Whether to enable the LIBDL library" FORCE) + else() + set(Kokkos_ENABLE_LIBDL OFF CACHE BOOL "Whether to enable the LIBDL library" FORCE) + endif() + + set(Kokkos_ENABLE_COMPLEX_ALIGN OFF CACHE BOOL "Whether to align Kokkos::complex to 2*alignof(RealType)") + + # FIXME_TRILINOS We run into problems when trying to use an external GTest in Trilinos CI + set(CMAKE_DISABLE_FIND_PACKAGE_GTest ON) +endif() diff --git a/lib/kokkos/cmake/kokkos_corner_cases.cmake b/lib/kokkos/cmake/kokkos_corner_cases.cmake index ede2b4e0ca..530e9e8fd8 100644 --- a/lib/kokkos/cmake/kokkos_corner_cases.cmake +++ b/lib/kokkos/cmake/kokkos_corner_cases.cmake @@ -1,4 +1,8 @@ -IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.2) - MESSAGE(WARNING "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496") -ENDIF() - +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND KOKKOS_ENABLE_CUDA_CONSTEXPR AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 11.2 +) + message( + WARNING + "You have requested -DKokkos_ENABLE_CUDA_CONSTEXPR=ON for NVCC ${KOKKOS_CXX_COMPILER_VERSION} which is known to trigger compiler bugs before NVCC version 11.2. See https://github.com/kokkos/kokkos/issues/3496" + ) +endif() diff --git a/lib/kokkos/cmake/kokkos_enable_devices.cmake b/lib/kokkos/cmake/kokkos_enable_devices.cmake index c7d189285c..40c2d3ea8a 100644 --- a/lib/kokkos/cmake/kokkos_enable_devices.cmake +++ b/lib/kokkos/cmake/kokkos_enable_devices.cmake @@ -1,128 +1,132 @@ - -FUNCTION(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME}) - LIST(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) +function(KOKKOS_DEVICE_OPTION SUFFIX DEFAULT DEV_TYPE DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME}) + list(APPEND KOKKOS_ENABLED_DEVICES ${SUFFIX}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) - IF (KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") - SET(KOKKOS_HAS_HOST ON PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + set(KOKKOS_ENABLED_DEVICES ${KOKKOS_ENABLED_DEVICES} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) + if(KOKKOS_ENABLE_${UC_NAME} AND DEV_TYPE STREQUAL "HOST") + set(KOKKOS_HAS_HOST ON PARENT_SCOPE) + endif() +endfunction() -KOKKOS_CFG_DEPENDS(DEVICES NONE) +kokkos_cfg_depends(DEVICES NONE) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(DEVICES ENABLE) +kokkos_deprecated_list(DEVICES ENABLE) - -KOKKOS_DEVICE_OPTION(THREADS OFF HOST "Whether to build C++ threads backend") +kokkos_device_option(THREADS OFF HOST "Whether to build C++ threads backend") # detect clang++ / cl / clang-cl clashes -IF (CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") +if(CMAKE_CXX_COMPILER_ID STREQUAL Clang AND "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") # this specific test requires CMake >= 3.15 - IF ("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") + if("x${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}" STREQUAL "xGNU") # use pure clang++ instead of clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC OFF) - ELSE() + set(KOKKOS_COMPILER_CLANG_MSVC OFF) + else() # it defaults to clang-cl - SET(KOKKOS_COMPILER_CLANG_MSVC ON) - ENDIF() -ENDIF() - -IF(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) - SET(OMP_DEFAULT ON) -ELSE() - SET(OMP_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") + set(KOKKOS_COMPILER_CLANG_MSVC ON) + endif() +endif() +if(Trilinos_ENABLE_Kokkos AND Trilinos_ENABLE_OpenMP) + set(OMP_DEFAULT ON) +else() + set(OMP_DEFAULT OFF) +endif() +kokkos_device_option(OPENMP ${OMP_DEFAULT} HOST "Whether to build OpenMP backend") # We want this to default to OFF for cache reasons, but if no # host space is given, then activate serial -IF (KOKKOS_HAS_TRILINOS) - #However, Trilinos always wants Serial ON - SET(SERIAL_DEFAULT ON) -ELSEIF (KOKKOS_HAS_HOST) - SET(SERIAL_DEFAULT OFF) -ELSE() - SET(SERIAL_DEFAULT ON) - IF (NOT DEFINED Kokkos_ENABLE_SERIAL) - MESSAGE(STATUS "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt") - ENDIF() -ENDIF() -KOKKOS_DEVICE_OPTION(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") +if(KOKKOS_HAS_HOST) + set(SERIAL_DEFAULT OFF) +else() + set(SERIAL_DEFAULT ON) + if(NOT DEFINED Kokkos_ENABLE_SERIAL) + message( + STATUS + "SERIAL backend is being turned on to ensure there is at least one Host space. To change this, you must enable another host execution space and configure with -DKokkos_ENABLE_SERIAL=OFF or change CMakeCache.txt" + ) + endif() +endif() +kokkos_device_option(SERIAL ${SERIAL_DEFAULT} HOST "Whether to build serial backend") -KOKKOS_DEVICE_OPTION(HPX OFF HOST "Whether to build HPX backend (experimental)") +kokkos_device_option(HPX OFF HOST "Whether to build HPX backend (experimental)") # Device backends have to come after host backends for header include order reasons # Without this we can't make e.g. CudaSpace accessible by HostSpace -KOKKOS_DEVICE_OPTION(OPENACC OFF DEVICE "Whether to build the OpenACC backend") -IF (KOKKOS_ENABLE_OPENACC) - COMPILER_SPECIFIC_FLAGS( - Clang -fopenacc -fopenacc-fake-async-wait - -Wno-openacc-and-cxx -Wno-openmp-mapping -Wno-unknown-cuda-version - -Wno-pass-failed +kokkos_device_option(OPENACC OFF DEVICE "Whether to build the OpenACC backend") +if(KOKKOS_ENABLE_OPENACC) + compiler_specific_flags( + Clang + -fopenacc + -fopenacc-fake-async-wait + -fopenacc-implicit-worker=vector + -Wno-openacc-and-cxx + -Wno-openmp-mapping + -Wno-unknown-cuda-version + -Wno-pass-failed ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) +endif() + +kokkos_device_option(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") +if(KOKKOS_ENABLE_OPENMPTARGET) + set(ClangOpenMPFlag -fopenmp=libomp) + if(KOKKOS_CLANG_IS_CRAY) + set(ClangOpenMPFlag -fopenmp) + endif() + + compiler_specific_flags( + Clang + ${ClangOpenMPFlag} + -Wno-openmp-mapping + IntelLLVM + -fiopenmp + -Wno-openmp-mapping + NVHPC + -mp=gpu + DEFAULT + -fopenmp ) -ENDIF() + compiler_specific_defs(Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG) + # Are there compilers which identify as Clang and need this library? + # COMPILER_SPECIFIC_LIBS( + # Clang -lopenmptarget + # ) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") + endif() +endif() -KOKKOS_DEVICE_OPTION(OPENMPTARGET OFF DEVICE "Whether to build the OpenMP target backend") -IF (KOKKOS_ENABLE_OPENMPTARGET) - SET(ClangOpenMPFlag -fopenmp=libomp) - IF(KOKKOS_CLANG_IS_CRAY) - SET(ClangOpenMPFlag -fopenmp) - ENDIF() +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) + set(CUDA_DEFAULT ON) +else() + set(CUDA_DEFAULT OFF) +endif() +kokkos_device_option(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - COMPILER_SPECIFIC_FLAGS( - Clang ${ClangOpenMPFlag} -Wno-openmp-mapping - IntelLLVM -fiopenmp -Wno-openmp-mapping - NVHPC -mp=gpu - DEFAULT -fopenmp - ) - COMPILER_SPECIFIC_DEFS( - Clang KOKKOS_WORKAROUND_OPENMPTARGET_CLANG - ) -# Are there compilers which identify as Clang and need this library? -# COMPILER_SPECIFIC_LIBS( -# Clang -lopenmptarget -# ) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "OpenMPTarget backend requires C++17 or newer") - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_CUDA) + global_set(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") + ## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros + list(APPEND DEVICE_SETUP_LIST Cuda) +endif() -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_DEFAULT ON) -ELSE() - SET(CUDA_DEFAULT OFF) -ENDIF() -KOKKOS_DEVICE_OPTION(CUDA ${CUDA_DEFAULT} DEVICE "Whether to build CUDA backend") - -IF (KOKKOS_ENABLE_CUDA) - GLOBAL_SET(KOKKOS_DONT_ALLOW_EXTENSIONS "CUDA enabled") -## Cuda has extra setup requirements, turn on Kokkos_Setup_Cuda.hpp in macros - LIST(APPEND DEVICE_SETUP_LIST Cuda) -ENDIF() - -KOKKOS_DEVICE_OPTION(HIP OFF DEVICE "Whether to build HIP backend") +kokkos_device_option(HIP OFF DEVICE "Whether to build HIP backend") ## HIP has extra setup requirements, turn on Kokkos_Setup_HIP.hpp in macros -IF (KOKKOS_ENABLE_HIP) - LIST(APPEND DEVICE_SETUP_LIST HIP) -ENDIF() +if(KOKKOS_ENABLE_HIP) + list(APPEND DEVICE_SETUP_LIST HIP) +endif() -KOKKOS_DEVICE_OPTION(SYCL OFF DEVICE "Whether to build SYCL backend") +kokkos_device_option(SYCL OFF DEVICE "Whether to build SYCL backend") ## SYCL has extra setup requirements, turn on Kokkos_Setup_SYCL.hpp in macros -IF (KOKKOS_ENABLE_SYCL) - IF(KOKKOS_CXX_STANDARD LESS 17) - MESSAGE(FATAL_ERROR "SYCL backend requires C++17 or newer!") - ENDIF() - LIST(APPEND DEVICE_SETUP_LIST SYCL) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + if(KOKKOS_CXX_STANDARD LESS 17) + message(FATAL_ERROR "SYCL backend requires C++17 or newer!") + endif() + list(APPEND DEVICE_SETUP_LIST SYCL) +endif() diff --git a/lib/kokkos/cmake/kokkos_enable_options.cmake b/lib/kokkos/cmake/kokkos_enable_options.cmake index 53764b0c68..a5d6fdfe4e 100644 --- a/lib/kokkos/cmake/kokkos_enable_options.cmake +++ b/lib/kokkos/cmake/kokkos_enable_options.cmake @@ -1,198 +1,236 @@ ########################## NOTES ############################################### # List the options for configuring kokkos using CMake method of doing it. -# These options then get mapped onto KOKKOS_SETTINGS environment variable by -# kokkos_settings.cmake. It is separate to allow other packages to override -# these variables (e.g., TriBITS). ########################## AVAILABLE OPTIONS ################################### # Use lists for documentation, verification, and programming convenience - -FUNCTION(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) - KOKKOS_OPTION(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) - STRING(TOUPPER ${SUFFIX} UC_NAME) - IF (KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - LIST(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) +function(KOKKOS_ENABLE_OPTION SUFFIX DEFAULT DOCSTRING) + kokkos_option(ENABLE_${SUFFIX} ${DEFAULT} BOOL ${DOCSTRING}) + string(TOUPPER ${SUFFIX} UC_NAME) + if(KOKKOS_ENABLE_${UC_NAME} AND NOT "Kokkos_ENABLE_${UC_NAME}" IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) + list(APPEND KOKKOS_ENABLED_OPTIONS ${UC_NAME}) #I hate that CMake makes me do this - SET(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) - ENDIF() - SET(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) -ENDFUNCTION() + set(KOKKOS_ENABLED_OPTIONS ${KOKKOS_ENABLED_OPTIONS} PARENT_SCOPE) + endif() + set(KOKKOS_ENABLE_${UC_NAME} ${KOKKOS_ENABLE_${UC_NAME}} PARENT_SCOPE) +endfunction() # Certain defaults will depend on knowing the enabled devices -KOKKOS_CFG_DEPENDS(OPTIONS DEVICES) -KOKKOS_CFG_DEPENDS(OPTIONS COMPILER_ID) +kokkos_cfg_depends(OPTIONS DEVICES) +kokkos_cfg_depends(OPTIONS COMPILER_ID) # Put a check in just in case people are using this option -KOKKOS_DEPRECATED_LIST(OPTIONS ENABLE) +kokkos_deprecated_list(OPTIONS ENABLE) -KOKKOS_ENABLE_OPTION(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") -KOKKOS_ENABLE_OPTION(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") -KOKKOS_ENABLE_OPTION(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") +kokkos_enable_option(CUDA_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for CUDA") +kokkos_enable_option(CUDA_UVM OFF "Whether to use unified memory (UM) for CUDA by default") +kokkos_enable_option(CUDA_LDG_INTRINSIC OFF "Whether to use CUDA LDG intrinsics") # In contrast to other CUDA-dependent, options CUDA_LAMBDA is ON by default. # That is problematic when CUDA is not enabled because this not only yields a # bogus warning, but also exports the Kokkos_ENABLE_CUDA_LAMBDA variable and -# sets it to ON. This if-clause is a crutch that delays the refactoring of the -# way we declare all options until after we get rid of TriBITS. -IF (Trilinos_ENABLE_Kokkos AND TPL_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSEIF (KOKKOS_ENABLE_CUDA) - SET(CUDA_LAMBDA_DEFAULT ON) -ELSE() - SET(CUDA_LAMBDA_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_LAMBDA ${CUDA_LAMBDA_DEFAULT} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**") +# sets it to ON. +kokkos_enable_option( + CUDA_LAMBDA ${KOKKOS_ENABLE_CUDA} "Whether to allow lambda expressions on the device with NVCC **DEPRECATED**" +) -# May be used to disable our use of CudaMallocAsync. It had caused issues in -# the past when UCX was used as MPI communication layer. We expect it is -# resolved but we keep the option around a bit longer to be safe. -KOKKOS_ENABLE_OPTION(IMPL_CUDA_MALLOC_ASYNC ON "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") -KOKKOS_ENABLE_OPTION(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") -KOKKOS_ENABLE_OPTION(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") +# As of 09/2024, cudaMallocAsync causes issues with ICP and older version of UCX +# as MPI communication layer. +kokkos_enable_option(IMPL_CUDA_MALLOC_ASYNC OFF "Whether to enable CudaMallocAsync (requires CUDA Toolkit 11.2)") +kokkos_enable_option(IMPL_NVHPC_AS_DEVICE_COMPILER OFF "Whether to allow nvc++ as Cuda device compiler") +kokkos_enable_option(IMPL_CUDA_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for CUDA") -KOKKOS_ENABLE_OPTION(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available" ) -KOKKOS_ENABLE_OPTION(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings" ) -KOKKOS_ENABLE_OPTION(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") -KOKKOS_ENABLE_OPTION(TESTS OFF "Whether to build the unit tests") -KOKKOS_ENABLE_OPTION(BENCHMARKS OFF "Whether to build the benchmarks") -KOKKOS_ENABLE_OPTION(EXAMPLES OFF "Whether to build the examples") -STRING(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) -IF(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") - KOKKOS_ENABLE_OPTION(DEBUG ON "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") -ELSE() - KOKKOS_ENABLE_OPTION(DEBUG OFF "Whether to activate extra debug features - may increase compile times") - KOKKOS_ENABLE_OPTION(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") -ENDIF() -UNSET(_UPPERCASE_CMAKE_BUILD_TYPE) -KOKKOS_ENABLE_OPTION(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") -KOKKOS_ENABLE_OPTION(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") -KOKKOS_ENABLE_OPTION(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") -KOKKOS_ENABLE_OPTION(TUNING OFF "Whether to create bindings for tuning tools") -KOKKOS_ENABLE_OPTION(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") -KOKKOS_ENABLE_OPTION(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") -KOKKOS_ENABLE_OPTION(HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time") -KOKKOS_ENABLE_OPTION(IMPL_HIP_UNIFIED_MEMORY OFF "Whether to leverage unified memory architectures for HIP") +kokkos_enable_option(DEPRECATED_CODE_4 ON "Whether code deprecated in major release 4 is available") +kokkos_enable_option(DEPRECATION_WARNINGS ON "Whether to emit deprecation warnings") +kokkos_enable_option(HIP_RELOCATABLE_DEVICE_CODE OFF "Whether to enable relocatable device code (RDC) for HIP") + +# Disabling RDC only works properly since oneAPI 2024.1.0 +if(KOKKOS_ENABLE_SYCL AND KOKKOS_CXX_COMPILER_ID STREQUAL IntelLLVM AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS + 2024.1.0 +) + set(SYCL_RDC_DEFAULT ON) +else() + set(SYCL_RDC_DEFAULT OFF) +endif() +kokkos_enable_option( + SYCL_RELOCATABLE_DEVICE_CODE ${SYCL_RDC_DEFAULT} "Whether to enable relocatable device code (RDC) for SYCL" +) +kokkos_enable_option(TESTS OFF "Whether to build the unit tests") +kokkos_enable_option(BENCHMARKS OFF "Whether to build the benchmarks") +kokkos_enable_option(EXAMPLES OFF "Whether to build the examples") +string(TOUPPER "${CMAKE_BUILD_TYPE}" UPPERCASE_CMAKE_BUILD_TYPE) +if(UPPERCASE_CMAKE_BUILD_TYPE STREQUAL "DEBUG") + kokkos_enable_option(DEBUG ON "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK ON "Debug check on dual views") +else() + kokkos_enable_option(DEBUG OFF "Whether to activate extra debug features - may increase compile times") + kokkos_enable_option(DEBUG_DUALVIEW_MODIFY_CHECK OFF "Debug check on dual views") +endif() +unset(_UPPERCASE_CMAKE_BUILD_TYPE) +kokkos_enable_option(LARGE_MEM_TESTS OFF "Whether to perform extra large memory tests") +kokkos_enable_option(DEBUG_BOUNDS_CHECK OFF "Whether to use bounds checking - will increase runtime") +kokkos_enable_option(COMPILER_WARNINGS OFF "Whether to print all compiler warnings") +kokkos_enable_option(TUNING OFF "Whether to create bindings for tuning tools") +kokkos_enable_option(AGGRESSIVE_VECTORIZATION OFF "Whether to aggressively vectorize loops") +kokkos_enable_option(COMPILE_AS_CMAKE_LANGUAGE OFF "Whether to use native cmake language support") +kokkos_enable_option( + HIP_MULTIPLE_KERNEL_INSTANTIATIONS OFF + "Whether multiple kernels are instantiated at compile time - improve performance but increase compile time" +) +kokkos_enable_option(IMPL_HIP_MALLOC_ASYNC OFF "Whether to enable hipMallocAsync") +kokkos_enable_option(OPENACC_FORCE_HOST_AS_DEVICE OFF "Whether to force to use host as a target device for OpenACC") # This option will go away eventually, but allows fallback to old implementation when needed. -KOKKOS_ENABLE_OPTION(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") -KOKKOS_ENABLE_OPTION(ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases") -KOKKOS_ENABLE_OPTION(IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting") +kokkos_enable_option(DESUL_ATOMICS_EXTERNAL OFF "Whether to use an external desul installation") +kokkos_enable_option( + ATOMICS_BYPASS OFF "**NOT RECOMMENDED** Whether to make atomics non-atomic for non-threaded MPI-only use cases" +) +kokkos_enable_option( + IMPL_REF_COUNT_BRANCH_UNLIKELY ON "Whether to use the C++20 `[[unlikely]]` attribute in the view reference counting" +) mark_as_advanced(Kokkos_ENABLE_IMPL_REF_COUNT_BRANCH_UNLIKELY) -KOKKOS_ENABLE_OPTION(IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction.") +kokkos_enable_option( + IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND OFF + "Whether to enable a workaround for invalid use of View of Views that causes program hang on destruction." +) mark_as_advanced(Kokkos_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND) -KOKKOS_ENABLE_OPTION(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") -KOKKOS_ENABLE_OPTION(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") -KOKKOS_ENABLE_OPTION(IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan") +kokkos_enable_option(IMPL_MDSPAN ON "Whether to enable experimental mdspan support") +kokkos_enable_option(MDSPAN_EXTERNAL OFF BOOL "Whether to use an external version of mdspan") +kokkos_enable_option( + IMPL_SKIP_COMPILER_MDSPAN ON BOOL "Whether to use an internal version of mdspan even if the compiler supports mdspan" +) mark_as_advanced(Kokkos_ENABLE_IMPL_MDSPAN) mark_as_advanced(Kokkos_ENABLE_MDSPAN_EXTERNAL) mark_as_advanced(Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) -IF (Trilinos_ENABLE_Kokkos) - SET(COMPLEX_ALIGN_DEFAULT OFF) -ELSE() - SET(COMPLEX_ALIGN_DEFAULT ON) -ENDIF() -KOKKOS_ENABLE_OPTION(COMPLEX_ALIGN ${COMPLEX_ALIGN_DEFAULT} "Whether to align Kokkos::complex to 2*alignof(RealType)") +kokkos_enable_option(COMPLEX_ALIGN ON "Whether to align Kokkos::complex to 2*alignof(RealType)") -IF (KOKKOS_ENABLE_TESTS) - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) -ELSE() - SET(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests") -IF (NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) - MESSAGE(WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored.") -ENDIF() +if(KOKKOS_ENABLE_TESTS) + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT ON) +else() + set(HEADER_SELF_CONTAINMENT_TESTS_DEFAULT OFF) +endif() +kokkos_enable_option( + HEADER_SELF_CONTAINMENT_TESTS ${HEADER_SELF_CONTAINMENT_TESTS_DEFAULT} "Enable header self-containment unit tests" +) +if(NOT KOKKOS_ENABLE_TESTS AND KOKKOS_ENABLE_HEADER_SELF_CONTAINMENT_TESTS) + message( + WARNING "Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS is ON but Kokkos_ENABLE_TESTS is OFF. Option will be ignored." + ) +endif() -IF (KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) - SET(CUDA_CONSTEXPR_DEFAULT ON) -ELSE() - SET(CUDA_CONSTEXPR_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions") +if(KOKKOS_ENABLE_CUDA AND (KOKKOS_CXX_COMPILER_ID STREQUAL Clang)) + set(CUDA_CONSTEXPR_DEFAULT ON) +else() + set(CUDA_CONSTEXPR_DEFAULT OFF) +endif() +kokkos_enable_option( + CUDA_CONSTEXPR ${CUDA_CONSTEXPR_DEFAULT} "Whether to activate experimental relaxed constexpr functions" +) -IF (KOKKOS_ENABLE_HPX) - SET(HPX_ASYNC_DISPATCH_DEFAULT ON) -ELSE() - SET(HPX_ASYNC_DISPATCH_DEFAULT OFF) -ENDIF() -KOKKOS_ENABLE_OPTION(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") +if(KOKKOS_ENABLE_HPX) + set(HPX_ASYNC_DISPATCH_DEFAULT ON) +else() + set(HPX_ASYNC_DISPATCH_DEFAULT OFF) +endif() +kokkos_enable_option(IMPL_HPX_ASYNC_DISPATCH ${HPX_ASYNC_DISPATCH_DEFAULT} "Whether HPX supports asynchronous dispatch") -Kokkos_ENABLE_OPTION(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") +kokkos_enable_option(UNSUPPORTED_ARCHS OFF "Whether to allow architectures in backends Kokkos doesn't optimize for") -FUNCTION(check_device_specific_options) - CMAKE_PARSE_ARGUMENTS(SOME "" "DEVICE" "OPTIONS" ${ARGN}) - IF(NOT KOKKOS_ENABLE_${SOME_DEVICE}) - FOREACH(OPTION ${SOME_OPTIONS}) - IF(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) - MESSAGE(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") - ENDIF() - IF(KOKKOS_ENABLE_${OPTION}) - MESSAGE(WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored.") - UNSET(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() +function(check_device_specific_options) + cmake_parse_arguments(SOME "" "DEVICE" "OPTIONS" ${ARGN}) + if(NOT KOKKOS_ENABLE_${SOME_DEVICE}) + foreach(OPTION ${SOME_OPTIONS}) + if(NOT DEFINED CACHE{Kokkos_ENABLE_${OPTION}} OR NOT DEFINED CACHE{Kokkos_ENABLE_${SOME_DEVICE}}) + message(FATAL_ERROR "Internal logic error: option '${OPTION}' or device '${SOME_DEVICE}' not recognized.") + endif() + if(KOKKOS_ENABLE_${OPTION}) + message( + WARNING "Kokkos_ENABLE_${OPTION} is ON but ${SOME_DEVICE} backend is not enabled. Option will be ignored." + ) + unset(KOKKOS_ENABLE_${OPTION} PARENT_SCOPE) + endif() + endforeach() + endif() +endfunction() -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE CUDA OPTIONS CUDA_UVM CUDA_RELOCATABLE_DEVICE_CODE CUDA_LAMBDA CUDA_CONSTEXPR CUDA_LDG_INTRINSIC IMPL_CUDA_UNIFIED_MEMORY) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE) -CHECK_DEVICE_SPECIFIC_OPTIONS(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options( + DEVICE + CUDA + OPTIONS + CUDA_UVM + CUDA_RELOCATABLE_DEVICE_CODE + CUDA_LAMBDA + CUDA_CONSTEXPR + CUDA_LDG_INTRINSIC + IMPL_CUDA_MALLOC_ASYNC + IMPL_CUDA_UNIFIED_MEMORY +) +check_device_specific_options( + DEVICE HIP OPTIONS HIP_RELOCATABLE_DEVICE_CODE HIP_MULTIPLE_KERNEL_INSTANTIATIONS IMPL_HIP_MALLOC_ASYNC +) +check_device_specific_options(DEVICE HPX OPTIONS IMPL_HPX_ASYNC_DISPATCH) +check_device_specific_options(DEVICE OPENACC OPTIONS OPENACC_FORCE_HOST_AS_DEVICE) # Needed due to change from deprecated name to new header define name -IF (KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) - SET(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) -ENDIF() +if(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) + set(KOKKOS_OPT_RANGE_AGGRESSIVE_VECTORIZATION ON) +endif() # Force consistency of KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE # and CMAKE_CUDA_SEPARABLE_COMPILATION when we are compiling # using the CMake CUDA language support. # Either one being on will turn the other one on. -IF (KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - IF (NOT CMAKE_CUDA_SEPARABLE_COMPILATION) - MESSAGE(STATUS "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support") - SET(CMAKE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - ELSE() - IF (CMAKE_CUDA_SEPARABLE_COMPILATION) - SET(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) - ENDIF() - ENDIF() -ENDIF() +if(KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + if(NOT CMAKE_CUDA_SEPARABLE_COMPILATION) + message( + STATUS + "Setting CMAKE_CUDA_SEPARABLE_COMPILATION=ON since Kokkos_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE is true. When compiling Kokkos with CMake language CUDA, please use CMAKE_CUDA_SEPARABLE_COMPILATION to control RDC support" + ) + set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) + endif() + else() + if(CMAKE_CUDA_SEPARABLE_COMPILATION) + set(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE ON) + endif() + endif() +endif() # This is known to occur with Clang 9. We would need to use nvcc as the linker # http://lists.llvm.org/pipermail/cfe-dev/2018-June/058296.html # TODO: Through great effort we can use a different linker by hacking # CMAKE_CXX_LINK_EXECUTABLE in a future release -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - MESSAGE(FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC") -ENDIF() +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + message( + FATAL_ERROR "Relocatable device code is currently not supported with Clang - must use nvcc_wrapper or turn off RDC" + ) +endif() -IF (KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) - MESSAGE(FATAL_ERROR "Relocatable device code requires static libraries.") -ENDIF() +if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE AND BUILD_SHARED_LIBS) + message(FATAL_ERROR "Relocatable device code requires static libraries.") +endif() -IF(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") - ENDIF() -ENDIF() -IF(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) - IF(KOKKOS_ENABLE_DEPRECATED_CODE_4) - MESSAGE(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON") +if(Kokkos_ENABLE_CUDA_LDG_INTRINSIC) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message(DEPRECATION "Setting Kokkos_ENABLE_CUDA_LDG_INTRINSIC is deprecated. LDG intrinsics are always enabled.") + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LDG_INTRINSIC has been removed. LDG intrinsics are always enabled.") + endif() +endif() +if(Kokkos_ENABLE_CUDA AND NOT Kokkos_ENABLE_CUDA_LAMBDA) + if(KOKKOS_ENABLE_DEPRECATED_CODE_4) + message( + DEPRECATION + "Setting Kokkos_ENABLE_CUDA_LAMBDA is deprecated. Lambda expressions in device code are always enabled. Forcing -DKokkos_ENABLE_CUDA_LAMBDA=ON" + ) set(Kokkos_ENABLE_CUDA_LAMBDA ON CACHE BOOL "Kokkos turned Cuda lambda support ON!" FORCE) set(KOKKOS_ENABLE_CUDA_LAMBDA ON) - ELSE() - MESSAGE(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") - ENDIF() -ENDIF() + else() + message(FATAL_ERROR "Kokkos_ENABLE_CUDA_LAMBDA has been removed. Lambda expressions in device code always enabled.") + endif() +endif() - -IF(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) - MESSAGE(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") -ENDIF() +if(DEFINED Kokkos_ENABLE_IMPL_DESUL_ATOMICS) + message(WARNING "Kokkos_ENABLE_IMPL_DESUL_ATOMICS option has been removed. Desul atomics cannot be disabled.") +endif() diff --git a/lib/kokkos/cmake/kokkos_functions.cmake b/lib/kokkos/cmake/kokkos_functions.cmake index d1f1e0d7a7..38eedd8362 100644 --- a/lib/kokkos/cmake/kokkos_functions.cmake +++ b/lib/kokkos/cmake/kokkos_functions.cmake @@ -5,12 +5,8 @@ # Validate options are given with correct case and define an internal # upper-case version for use within -set(Kokkos_OPTIONS_NOT_TO_EXPORT - Kokkos_ENABLE_BENCHMARKS - Kokkos_ENABLE_EXAMPLES - Kokkos_ENABLE_TESTS - Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS - Kokkos_ENABLE_COMPILER_WARNINGS +set(Kokkos_OPTIONS_NOT_TO_EXPORT Kokkos_ENABLE_BENCHMARKS Kokkos_ENABLE_EXAMPLES Kokkos_ENABLE_TESTS + Kokkos_ENABLE_HEADER_SELF_CONTAINMENT_TESTS Kokkos_ENABLE_COMPILER_WARNINGS ) # @@ -22,139 +18,122 @@ set(Kokkos_OPTIONS_NOT_TO_EXPORT # It attempts to print a helpful message about updating the options for the new CMake. # Kokkos_${SUFFIX} is the name of the option (like Kokkos_ARCH) being checked. # Kokkos_${PREFIX}_X is the name of new option to be defined from a list X,Y,Z,... -FUNCTION(kokkos_deprecated_list SUFFIX PREFIX) - SET(CAMEL_NAME Kokkos_${SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_deprecated_list SUFFIX PREFIX) + set(CAMEL_NAME Kokkos_${SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) #I don't love doing it this way but better to be safe - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - STRING(REPLACE "," ";" optlist "${${opt}}") - SET(ERROR_MSG "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:") - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - STRING(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") - ENDFOREACH() - STRING(APPEND ERROR_MSG "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it).") - IF (KOKKOS_HAS_TRILINOS) - MESSAGE(WARNING ${ERROR_MSG}) - FOREACH(entry ${optlist}) - STRING(TOUPPER ${entry} ENTRY_UC) - SET(${CAMEL_NAME}_${ENTRY_UC} ON CACHE BOOL "Deprecated Trilinos translation") - ENDFOREACH() - UNSET(${opt} CACHE) - ELSE() - MESSAGE(SEND_ERROR ${ERROR_MSG}) - ENDIF() - ENDIF() - ENDFOREACH() -ENDFUNCTION() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + string(REPLACE "," ";" optlist "${${opt}}") + set(ERROR_MSG + "Given deprecated option list ${opt}. This must now be given as separate -D options, which assuming you spelled options correctly would be:" + ) + foreach(entry ${optlist}) + string(TOUPPER ${entry} ENTRY_UC) + string(APPEND ERROR_MSG "\n -DKokkos_${PREFIX}_${ENTRY_UC}=ON") + endforeach() + string( + APPEND + ERROR_MSG + "\nRemove CMakeCache.txt and re-run. For a list of valid options, refer to BUILD.md or even look at CMakeCache.txt (before deleting it)." + ) + message(SEND_ERROR ${ERROR_MSG}) + endif() + endforeach() +endfunction() -FUNCTION(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_option CAMEL_SUFFIX DEFAULT TYPE DOCSTRING) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES ${TYPE}) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES ${TYPE}) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) # Make sure this appears in the cache with the appropriate DOCSTRING - SET(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) - - IF (KOKKOS_HAS_TRILINOS) - IF (NOT CAMEL_NAME IN_LIST Kokkos_OPTIONS_NOT_TO_EXPORT) - TRIBITS_PKG_EXPORT_CACHE_VAR(${CAMEL_NAME}) - ENDIF() - ENDIF() + set(${CAMEL_NAME} ${DEFAULT} CACHE ${TYPE} ${DOCSTRING}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() -INCLUDE (CMakeDependentOption) -FUNCTION(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +include(CMakeDependentOption) +function(kokkos_dependent_option CAMEL_SUFFIX DOCSTRING DEFAULT DEPENDENCY FORCE) + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) - LIST(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) - SET(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") - SET(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) - LIST(APPEND KOKKOS_OPTION_TYPES BOOL) - SET(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX}) + set(KOKKOS_OPTION_KEYS ${KOKKOS_OPTION_KEYS} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_VALUES "${DOCSTRING}") + set(KOKKOS_OPTION_VALUES ${KOKKOS_OPTION_VALUES} PARENT_SCOPE) + list(APPEND KOKKOS_OPTION_TYPES BOOL) + set(KOKKOS_OPTION_TYPES ${KOKKOS_OPTION_TYPES} PARENT_SCOPE) - CMAKE_DEPENDENT_OPTION(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) + cmake_dependent_option(${CAMEL_NAME} ${DOCSTRING} ${DEFAULT} "${DEPENDENCY}" ${FORCE}) #I don't love doing it this way because it's N^2 in number options, but c'est la vie - FOREACH(opt ${KOKKOS_GIVEN_VARIABLES}) - STRING(TOUPPER ${opt} OPT_UC) - IF ("${OPT_UC}" STREQUAL "${UC_NAME}") - IF (NOT "${opt}" STREQUAL "${CAMEL_NAME}") - IF (KOKKOS_HAS_TRILINOS) - #Allow this for now if Trilinos... we need to bootstrap our way to integration - MESSAGE(WARNING "Deprecated option ${opt} found - please change spelling to ${CAMEL_NAME}") - SET(${CAMEL_NAME} "${${opt}}" CACHE ${TYPE} ${DOCSTRING} FORCE) - UNSET(${opt} CACHE) - ELSE() - MESSAGE(FATAL_ERROR "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies.") - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + foreach(opt ${KOKKOS_GIVEN_VARIABLES}) + string(TOUPPER ${opt} OPT_UC) + if("${OPT_UC}" STREQUAL "${UC_NAME}") + if(NOT "${opt}" STREQUAL "${CAMEL_NAME}") + message( + FATAL_ERROR + "Matching option found for ${CAMEL_NAME} with the wrong case ${opt}. Please delete your CMakeCache.txt and change option to -D${CAMEL_NAME}=${${opt}}. This is now enforced to avoid hard-to-debug CMake cache inconsistencies." + ) + endif() + endif() + endforeach() #okay, great, we passed the validation test - use the default - IF (DEFINED ${CAMEL_NAME}) - SET(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) - ELSE() - SET(${UC_NAME} ${DEFAULT} PARENT_SCOPE) - ENDIF() -ENDFUNCTION() + if(DEFINED ${CAMEL_NAME}) + set(${UC_NAME} ${${CAMEL_NAME}} PARENT_SCOPE) + else() + set(${UC_NAME} ${DEFAULT} PARENT_SCOPE) + endif() +endfunction() -FUNCTION(kokkos_set_option CAMEL_SUFFIX VALUE) - LIST(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) - IF(OPTION_INDEX EQUAL -1) - MESSAGE(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") - ENDIF() - SET(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) - STRING(TOUPPER ${CAMEL_NAME} UC_NAME) +function(kokkos_set_option CAMEL_SUFFIX VALUE) + list(FIND KOKKOS_OPTION_KEYS ${CAMEL_SUFFIX} OPTION_INDEX) + if(OPTION_INDEX EQUAL -1) + message(FATAL_ERROR "Couldn't set value for Kokkos_${CAMEL_SUFFIX}") + endif() + set(CAMEL_NAME Kokkos_${CAMEL_SUFFIX}) + string(TOUPPER ${CAMEL_NAME} UC_NAME) - LIST(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) - LIST(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) - SET(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) - MESSAGE(STATUS "Setting ${CAMEL_NAME}=${VALUE}") - SET(${UC_NAME} ${VALUE} PARENT_SCOPE) -ENDFUNCTION() + list(GET KOKKOS_OPTION_VALUES ${OPTION_INDEX} DOCSTRING) + list(GET KOKKOS_OPTION_TYPES ${OPTION_INDEX} TYPE) + set(${CAMEL_NAME} ${VALUE} CACHE ${TYPE} ${DOCSTRING} FORCE) + message(STATUS "Setting ${CAMEL_NAME}=${VALUE}") + set(${UC_NAME} ${VALUE} PARENT_SCOPE) +endfunction() -FUNCTION(kokkos_append_config_line LINE) - GLOBAL_APPEND(KOKKOS_TPL_EXPORTS "${LINE}") -ENDFUNCTION() +function(kokkos_append_config_line LINE) + global_append(KOKKOS_TPL_EXPORTS "${LINE}") +endfunction() -MACRO(kokkos_export_cmake_tpl NAME) +macro(kokkos_export_cmake_tpl NAME) cmake_parse_arguments(KOKKOS_EXTRA_ARG "REQUIRED" "" "COMPONENTS" ${ARGN}) #CMake TPLs are located with a call to find_package @@ -163,91 +142,88 @@ MACRO(kokkos_export_cmake_tpl NAME) #If Kokkos was configured to find the TPL through a _DIR variable #make sure thar DIR variable is available to downstream packages - IF (DEFINED ${NAME}_DIR) + if(DEFINED ${NAME}_DIR) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_DIR)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_DIR ${${NAME}_DIR})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_DIR)") + kokkos_append_config_line(" SET(${NAME}_DIR ${${NAME}_DIR})") + kokkos_append_config_line("ENDIF()") + endif() - IF (DEFINED ${NAME}_ROOT) + if(DEFINED ${NAME}_ROOT) #The downstream project may override the TPL location that Kokkos used #Check if the downstream project chose its own TPL location #If not, make the Kokkos found location available - KOKKOS_APPEND_CONFIG_LINE("IF(NOT DEFINED ${NAME}_ROOT)") - KOKKOS_APPEND_CONFIG_LINE(" SET(${NAME}_ROOT ${${NAME}_ROOT})") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - SET(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") + kokkos_append_config_line("IF(NOT DEFINED ${NAME}_ROOT)") + kokkos_append_config_line(" SET(${NAME}_ROOT ${${NAME}_ROOT})") + kokkos_append_config_line("ENDIF()") + endif() + set(KOKKOS_CONFIG_STRING "FIND_DEPENDENCY(${NAME}") - IF(KOKKOS_EXTRA_ARG_REQUIRED) - STRING(APPEND KOKKOS_CONFIG_STRING " REQUIRED") - ENDIF() - IF(KOKKOS_EXTRA_ARG_COMPONENTS) - STRING(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") - ENDIF() - STRING(APPEND KOKKOS_CONFIG_STRING ")") - KOKKOS_APPEND_CONFIG_LINE(${KOKKOS_CONFIG_STRING}) -ENDMACRO() + if(KOKKOS_EXTRA_ARG_REQUIRED) + string(APPEND KOKKOS_CONFIG_STRING " REQUIRED") + endif() + if(KOKKOS_EXTRA_ARG_COMPONENTS) + string(APPEND KOKKOS_CONFIG_STRING " COMPONENTS ${KOKKOS_EXTRA_ARG_COMPONENTS}") + endif() + string(APPEND KOKKOS_CONFIG_STRING ")") + kokkos_append_config_line(${KOKKOS_CONFIG_STRING}) +endmacro() -MACRO(kokkos_export_imported_tpl NAME) - IF (NOT KOKKOS_HAS_TRILINOS) - GET_TARGET_PROPERTY(LIB_IMPORTED ${NAME} IMPORTED) - IF (NOT LIB_IMPORTED) - # This is not an imported target - # This an interface library that we created - INSTALL( - TARGETS ${NAME} - EXPORT KokkosTargets - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - ) - ELSE() - #make sure this also gets "exported" in the config file - KOKKOS_APPEND_CONFIG_LINE("IF(NOT TARGET ${NAME})") +macro(kokkos_export_imported_tpl NAME) + get_target_property(LIB_IMPORTED ${NAME} IMPORTED) + if(NOT LIB_IMPORTED) + # This is not an imported target + # This an interface library that we created + install( + TARGETS ${NAME} + EXPORT KokkosTargets + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + ) + else() + #make sure this also gets "exported" in the config file + kokkos_append_config_line("IF(NOT TARGET ${NAME})") - GET_TARGET_PROPERTY(LIB_TYPE ${NAME} TYPE) - IF (${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - ELSE() - KOKKOS_APPEND_CONFIG_LINE("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") - KOKKOS_APPEND_CONFIG_LINE("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") - GET_TARGET_PROPERTY(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) - IF(TPL_LIBRARY) - KOKKOS_APPEND_CONFIG_LINE("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") - ENDIF() - ENDIF() + get_target_property(LIB_TYPE ${NAME} TYPE) + if(${LIB_TYPE} STREQUAL "INTERFACE_LIBRARY") + kokkos_append_config_line("ADD_LIBRARY(${NAME} INTERFACE IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + else() + kokkos_append_config_line("ADD_LIBRARY(${NAME} UNKNOWN IMPORTED)") + kokkos_append_config_line("SET_TARGET_PROPERTIES(${NAME} PROPERTIES") + get_target_property(TPL_LIBRARY ${NAME} IMPORTED_LOCATION) + if(TPL_LIBRARY) + kokkos_append_config_line("IMPORTED_LOCATION \"${TPL_LIBRARY}\"") + endif() + endif() - GET_TARGET_PROPERTY(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) - IF(TPL_INCLUDES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") - ENDIF() + get_target_property(TPL_INCLUDES ${NAME} INTERFACE_INCLUDE_DIRECTORIES) + if(TPL_INCLUDES) + kokkos_append_config_line("INTERFACE_INCLUDE_DIRECTORIES \"${TPL_INCLUDES}\"") + endif() - GET_TARGET_PROPERTY(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) - IF(TPL_COMPILE_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") - ENDIF() + get_target_property(TPL_COMPILE_OPTIONS ${NAME} INTERFACE_COMPILE_OPTIONS) + if(TPL_COMPILE_OPTIONS) + kokkos_append_config_line("INTERFACE_COMPILE_OPTIONS ${TPL_COMPILE_OPTIONS}") + endif() - SET(TPL_LINK_OPTIONS) - GET_TARGET_PROPERTY(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) - IF(TPL_LINK_OPTIONS) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") - ENDIF() - - GET_TARGET_PROPERTY(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) - IF(TPL_LINK_LIBRARIES) - KOKKOS_APPEND_CONFIG_LINE("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") - ENDIF() - KOKKOS_APPEND_CONFIG_LINE(")") - KOKKOS_APPEND_CONFIG_LINE("ENDIF()") - ENDIF() - ENDIF() -ENDMACRO() + set(TPL_LINK_OPTIONS) + get_target_property(TPL_LINK_OPTIONS ${NAME} INTERFACE_LINK_OPTIONS) + if(TPL_LINK_OPTIONS) + kokkos_append_config_line("INTERFACE_LINK_OPTIONS ${TPL_LINK_OPTIONS}") + endif() + get_target_property(TPL_LINK_LIBRARIES ${NAME} INTERFACE_LINK_LIBRARIES) + if(TPL_LINK_LIBRARIES) + kokkos_append_config_line("INTERFACE_LINK_LIBRARIES \"${TPL_LINK_LIBRARIES}\"") + endif() + kokkos_append_config_line(")") + kokkos_append_config_line("ENDIF()") + endif() +endmacro() # # @MACRO: KOKKOS_IMPORT_TPL() @@ -271,57 +247,43 @@ ENDMACRO() # # If specified, this TPL will build an INTERFACE library rather than an # IMPORTED target -IF (KOKKOS_HAS_TRILINOS) -MACRO(kokkos_import_tpl NAME) - #do nothing -ENDMACRO() -ELSE() -MACRO(kokkos_import_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT;INTERFACE" - "" - "" - ${ARGN}) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() +macro(kokkos_import_tpl NAME) + cmake_parse_arguments(TPL "NO_EXPORT;INTERFACE" "" "" ${ARGN}) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() - IF (KOKKOS_ENABLE_${NAME}) + if(KOKKOS_ENABLE_${NAME}) #Tack on a TPL here to make sure we avoid using anyone else's find - FIND_PACKAGE(TPL${NAME} REQUIRED MODULE) - IF(NOT TARGET ${TPL_IMPORTED_NAME}) - MESSAGE(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") - ENDIF() - IF(NOT TPL_NO_EXPORT) - GET_TARGET_PROPERTY(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) - IF (NOT TPL_ORIGINAL_NAME) - SET(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) - ENDIF() - KOKKOS_EXPORT_IMPORTED_TPL(${TPL_ORIGINAL_NAME}) - ENDIF() - LIST(APPEND KOKKOS_ENABLED_TPLS ${NAME}) - ENDIF() -ENDMACRO(kokkos_import_tpl) -ENDIF() + find_package(TPL${NAME} REQUIRED MODULE) + if(NOT TARGET ${TPL_IMPORTED_NAME}) + message(FATAL_ERROR "Find module succeeded for ${NAME}, but did not produce valid target ${TPL_IMPORTED_NAME}") + endif() + if(NOT TPL_NO_EXPORT) + get_target_property(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME} ALIASED_TARGET) + if(NOT TPL_ORIGINAL_NAME) + set(TPL_ORIGINAL_NAME ${TPL_IMPORTED_NAME}) + endif() + kokkos_export_imported_tpl(${TPL_ORIGINAL_NAME}) + endif() + list(APPEND KOKKOS_ENABLED_TPLS ${NAME}) + endif() +endmacro(kokkos_import_tpl) -MACRO(kokkos_import_cmake_tpl MODULE_NAME) +macro(kokkos_import_cmake_tpl MODULE_NAME) kokkos_import_tpl(${MODULE_NAME} ${ARGN} NO_EXPORT) - CMAKE_PARSE_ARGUMENTS(TPL - "NO_EXPORT" - "OPTION_NAME" - "" - ${ARGN}) + cmake_parse_arguments(TPL "NO_EXPORT" "OPTION_NAME" "" ${ARGN}) - IF (NOT TPL_OPTION_NAME) - SET(TPL_OPTION_NAME ${MODULE_NAME}) - ENDIF() + if(NOT TPL_OPTION_NAME) + set(TPL_OPTION_NAME ${MODULE_NAME}) + endif() - IF (NOT TPL_NO_EXPORT) - KOKKOS_EXPORT_CMAKE_TPL(${MODULE_NAME}) - ENDIF() -ENDMACRO() + if(NOT TPL_NO_EXPORT) + kokkos_export_cmake_tpl(${MODULE_NAME}) + endif() +endmacro() # # @MACRO: KOKKOS_CREATE_IMPORTED_TPL() @@ -368,68 +330,57 @@ ENDMACRO() # # If specified, this gives a list of linker flags that must be used # for using this library. -MACRO(kokkos_create_imported_tpl NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE" - "LIBRARY" - "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" - ${ARGN}) +macro(kokkos_create_imported_tpl NAME) + cmake_parse_arguments( + TPL "INTERFACE" "LIBRARY" "LINK_LIBRARIES;INCLUDES;COMPILE_DEFINITIONS;COMPILE_OPTIONS;LINK_OPTIONS" ${ARGN} + ) - - IF (KOKKOS_HAS_TRILINOS) - #TODO: we need to set a bunch of cache variables here - ELSEIF (TPL_INTERFACE) - ADD_LIBRARY(${NAME} INTERFACE) + if(TPL_INTERFACE) + add_library(${NAME} INTERFACE) #Give this an importy-looking name - ADD_LIBRARY(Kokkos::${NAME} ALIAS ${NAME}) - IF (TPL_LIBRARY) - MESSAGE(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") - ENDIF() + add_library(Kokkos::${NAME} ALIAS ${NAME}) + if(TPL_LIBRARY) + message(SEND_ERROR "TPL Interface library ${NAME} should not have an IMPORTED_LOCATION") + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) - ENDIF() - IF(TPL_INCLUDES) - TARGET_INCLUDE_DIRECTORIES(${NAME} INTERFACE ${TPL_INCLUDES}) - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - TARGET_COMPILE_DEFINITIONS(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) - ENDIF() - IF(TPL_COMPILE_OPTIONS) - TARGET_COMPILE_OPTIONS(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) - ENDIF() - IF(TPL_LINK_OPTIONS) - TARGET_LINK_LIBRARIES(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) - ENDIF() - ELSE() - ADD_LIBRARY(${NAME} UNKNOWN IMPORTED) - IF(TPL_LIBRARY) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - IMPORTED_LOCATION ${TPL_LIBRARY}) - ENDIF() + if(TPL_LINK_LIBRARIES) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_LIBRARIES}) + endif() + if(TPL_INCLUDES) + target_include_directories(${NAME} INTERFACE ${TPL_INCLUDES}) + endif() + if(TPL_COMPILE_DEFINITIONS) + target_compile_definitions(${NAME} INTERFACE ${TPL_COMPILE_DEFINITIONS}) + endif() + if(TPL_COMPILE_OPTIONS) + target_compile_options(${NAME} INTERFACE ${TPL_COMPILE_OPTIONS}) + endif() + if(TPL_LINK_OPTIONS) + target_link_libraries(${NAME} INTERFACE ${TPL_LINK_OPTIONS}) + endif() + else() + add_library(${NAME} UNKNOWN IMPORTED) + if(TPL_LIBRARY) + set_target_properties(${NAME} PROPERTIES IMPORTED_LOCATION ${TPL_LIBRARY}) + endif() #Things have to go in quoted in case we have multiple list entries - IF(TPL_LINK_LIBRARIES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") - ENDIF() - IF(TPL_INCLUDES) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") - ENDIF() - IF(TPL_COMPILE_DEFINITIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") - ENDIF() - IF(TPL_COMPILE_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") - ENDIF() - IF(TPL_LINK_OPTIONS) - SET_TARGET_PROPERTIES(${NAME} PROPERTIES - INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") - ENDIF() - ENDIF() -ENDMACRO() + if(TPL_LINK_LIBRARIES) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_LIBRARIES}") + endif() + if(TPL_INCLUDES) + set_target_properties(${NAME} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TPL_INCLUDES}") + endif() + if(TPL_COMPILE_DEFINITIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_DEFINITIONS "${TPL_COMPILE_DEFINITIONS}") + endif() + if(TPL_COMPILE_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_COMPILE_OPTIONS "${TPL_COMPILE_OPTIONS}") + endif() + if(TPL_LINK_OPTIONS) + set_target_properties(${NAME} PROPERTIES INTERFACE_LINK_LIBRARIES "${TPL_LINK_OPTIONS}") + endif() + endif() +endmacro() # # @MACRO: KOKKOS_FIND_HEADER @@ -479,37 +430,32 @@ ENDMACRO() # # Custom paths to search for the header # -MACRO(kokkos_find_header VAR_NAME HEADER TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS" - ${ARGN}) +macro(kokkos_find_header VAR_NAME HEADER TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS" ${ARGN}) - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_PATH(${VAR_NAME} ${HEADER} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_path( + ${VAR_NAME} ${HEADER} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} PATH_SUFFIXES include - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_PATH(${VAR_NAME} ${HEADER}) - ENDIF() + find_path(${VAR_NAME} ${HEADER}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_LIBRARY @@ -565,42 +511,36 @@ ENDMACRO() # Suffixes appended to PATHS when attempting to locate # the library. Defaults to {lib, lib64}. # -MACRO(kokkos_find_library VAR_NAME LIB TPL_NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "ALLOW_SYSTEM_PATH_FALLBACK" - "" - "PATHS;SUFFIXES" - ${ARGN}) +macro(kokkos_find_library VAR_NAME LIB TPL_NAME) + cmake_parse_arguments(TPL "ALLOW_SYSTEM_PATH_FALLBACK" "" "PATHS;SUFFIXES" ${ARGN}) - IF(NOT TPL_SUFFIXES) - SET(TPL_SUFFIXES lib lib64) - ENDIF() + if(NOT TPL_SUFFIXES) + set(TPL_SUFFIXES lib lib64) + endif() - SET(${VAR_NAME} "${VARNAME}-NOTFOUND") - SET(HAVE_CUSTOM_PATHS FALSE) + set(${VAR_NAME} "${VARNAME}-NOTFOUND") + set(HAVE_CUSTOM_PATHS FALSE) - IF(DEFINED ${TPL_NAME}_ROOT OR - DEFINED ENV{${TPL_NAME}_ROOT} OR - DEFINED KOKKOS_${TPL_NAME}_DIR OR - TPL_PATHS) - FIND_LIBRARY(${VAR_NAME} ${LIB} - PATHS - ${${TPL_NAME}_ROOT} - $ENV{${TPL_NAME}_ROOT} - ${KOKKOS_${TPL_NAME}_DIR} - ${TPL_PATHS} - PATH_SUFFIXES - ${TPL_SUFFIXES} - NO_DEFAULT_PATH) - SET(HAVE_CUSTOM_PATHS TRUE) - ENDIF() + if(DEFINED ${TPL_NAME}_ROOT + OR DEFINED ENV{${TPL_NAME}_ROOT} + OR DEFINED KOKKOS_${TPL_NAME}_DIR + OR TPL_PATHS + ) + find_library( + ${VAR_NAME} ${LIB} + PATHS ${${TPL_NAME}_ROOT} $ENV{${TPL_NAME}_ROOT} ${KOKKOS_${TPL_NAME}_DIR} ${TPL_PATHS} + PATH_SUFFIXES ${TPL_SUFFIXES} + NO_DEFAULT_PATH + ) + set(HAVE_CUSTOM_PATHS TRUE) + endif() - IF(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) + if(NOT HAVE_CUSTOM_PATHS OR TPL_ALLOW_SYSTEM_PATH_FALLBACK) #No-op if ${VAR_NAME} set by previous call - FIND_LIBRARY(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) - ENDIF() + find_library(${VAR_NAME} ${LIB} PATH_SUFFIXES ${TPL_SUFFIXES}) + endif() -ENDMACRO() +endmacro() # # @MACRO: KOKKOS_FIND_IMPORTED @@ -683,111 +623,127 @@ ENDMACRO() # If specified, this gives a list of paths to search for the headers # If not given, _ROOT/include and _ROOT/include will be searched. # -MACRO(kokkos_find_imported NAME) - CMAKE_PARSE_ARGUMENTS(TPL - "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" - "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" - "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" - ${ARGN}) +macro(kokkos_find_imported NAME) + cmake_parse_arguments( + TPL "INTERFACE;ALLOW_SYSTEM_PATH_FALLBACK" "IMPORTED_NAME;MODULE_NAME;LIBRARY;HEADER" + "LIBRARIES;LIBRARY_PATHS;LIBRARY_SUFFIXES;HEADERS;HEADER_PATHS" ${ARGN} + ) - IF(NOT TPL_MODULE_NAME) - SET(TPL_MODULE_NAME TPL${NAME}) - ENDIF() + if(NOT TPL_MODULE_NAME) + set(TPL_MODULE_NAME TPL${NAME}) + endif() - IF (TPL_ALLOW_SYSTEM_PATH_FALLBACK) - SET(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) - ELSE() - SET(ALLOW_PATH_FALLBACK_OPT) - ENDIF() + if(TPL_ALLOW_SYSTEM_PATH_FALLBACK) + set(ALLOW_PATH_FALLBACK_OPT ALLOW_SYSTEM_PATH_FALLBACK) + else() + set(ALLOW_PATH_FALLBACK_OPT) + endif() - IF (NOT TPL_IMPORTED_NAME) - IF (TPL_INTERFACE) - SET(TPL_IMPORTED_NAME ${NAME}) - ELSE() - SET(TPL_IMPORTED_NAME Kokkos::${NAME}) - ENDIF() - ENDIF() + if(NOT TPL_IMPORTED_NAME) + if(TPL_INTERFACE) + set(TPL_IMPORTED_NAME ${NAME}) + else() + set(TPL_IMPORTED_NAME Kokkos::${NAME}) + endif() + endif() - IF (NOT TPL_LIBRARY_SUFFIXES) - SET(TPL_LIBRARY_SUFFIXES lib) - IF(KOKKOS_IMPL_32BIT) - LIST(APPEND TPL_LIBRARY_SUFFIXES lib32) - ELSE() - LIST(APPEND TPL_LIBRARY_SUFFIXES lib64) - ENDIF() - ENDIF() + if(NOT TPL_LIBRARY_SUFFIXES) + set(TPL_LIBRARY_SUFFIXES lib) + if(KOKKOS_IMPL_32BIT) + list(APPEND TPL_LIBRARY_SUFFIXES lib32) + else() + list(APPEND TPL_LIBRARY_SUFFIXES lib64) + endif() + endif() - SET(${NAME}_INCLUDE_DIRS) - IF (TPL_HEADER) - KOKKOS_FIND_HEADER(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - ENDIF() + set(${NAME}_INCLUDE_DIRS) + if(TPL_HEADER) + kokkos_find_header(${NAME}_INCLUDE_DIRS ${TPL_HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + endif() - FOREACH(HEADER ${TPL_HEADERS}) - KOKKOS_FIND_HEADER(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) - IF(HEADER_FIND_TEMP) - LIST(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) - ENDIF() - ENDFOREACH() + foreach(HEADER ${TPL_HEADERS}) + kokkos_find_header(HEADER_FIND_TEMP ${HEADER} ${NAME} ${ALLOW_PATH_FALLBACK_OPT} PATHS ${TPL_HEADER_PATHS}) + if(HEADER_FIND_TEMP) + list(APPEND ${NAME}_INCLUDE_DIRS ${HEADER_FIND_TEMP}) + endif() + endforeach() - SET(${NAME}_LIBRARY) - IF(TPL_LIBRARY) - KOKKOS_FIND_LIBRARY(${NAME}_LIBRARY ${TPL_LIBRARY} ${NAME} + set(${NAME}_LIBRARY) + if(TPL_LIBRARY) + kokkos_find_library( + ${NAME}_LIBRARY + ${TPL_LIBRARY} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - ENDIF() + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + endif() - SET(${NAME}_FOUND_LIBRARIES) - FOREACH(LIB ${TPL_LIBRARIES}) - KOKKOS_FIND_LIBRARY(${LIB}_LOCATION ${LIB} ${NAME} + set(${NAME}_FOUND_LIBRARIES) + foreach(LIB ${TPL_LIBRARIES}) + kokkos_find_library( + ${LIB}_LOCATION + ${LIB} + ${NAME} ${ALLOW_PATH_FALLBACK_OPT} - PATHS ${TPL_LIBRARY_PATHS} - SUFFIXES ${TPL_LIBRARY_SUFFIXES}) - IF(${LIB}_LOCATION) - LIST(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - ELSE() - SET(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) - BREAK() - ENDIF() - ENDFOREACH() + PATHS + ${TPL_LIBRARY_PATHS} + SUFFIXES + ${TPL_LIBRARY_SUFFIXES} + ) + if(${LIB}_LOCATION) + list(APPEND ${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + else() + set(${NAME}_FOUND_LIBRARIES ${${LIB}_LOCATION}) + break() + endif() + endforeach() - INCLUDE(FindPackageHandleStandardArgs) + include(FindPackageHandleStandardArgs) #Collect all the variables we need to be valid for #find_package to have succeeded - SET(TPL_VARS_NEEDED) - IF (TPL_LIBRARY) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) - ENDIF() - IF(TPL_HEADER) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) - ENDIF() - IF(TPL_LIBRARIES) - LIST(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) - ENDIF() - FIND_PACKAGE_HANDLE_STANDARD_ARGS(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) + set(TPL_VARS_NEEDED) + if(TPL_LIBRARY) + list(APPEND TPL_VARS_NEEDED ${NAME}_LIBRARY) + endif() + if(TPL_HEADER) + list(APPEND TPL_VARS_NEEDED ${NAME}_INCLUDE_DIRS) + endif() + if(TPL_LIBRARIES) + list(APPEND TPL_VARS_NEEDED ${NAME}_FOUND_LIBRARIES) + endif() + find_package_handle_standard_args(${TPL_MODULE_NAME} REQUIRED_VARS ${TPL_VARS_NEEDED}) - MARK_AS_ADVANCED(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) + mark_as_advanced(${NAME}_INCLUDE_DIRS ${NAME}_FOUND_LIBRARIES ${NAME}_LIBRARY) #this is so much fun on a Cray system #/usr/include should never be added as a -isystem include #this freaks out the compiler include search order - IF (KOKKOS_IS_CRAYPE) - LIST(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") - ENDIF() + if(KOKKOS_IS_CRAYPE) + list(REMOVE_ITEM ${NAME}_INCLUDE_DIRS "/usr/include") + endif() - IF (${TPL_MODULE_NAME}_FOUND) - SET(IMPORT_TYPE) - IF (TPL_INTERFACE) - SET(IMPORT_TYPE "INTERFACE") - SET(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) - ENDIF() - KOKKOS_CREATE_IMPORTED_TPL(${TPL_IMPORTED_NAME} + if(${TPL_MODULE_NAME}_FOUND) + set(IMPORT_TYPE) + if(TPL_INTERFACE) + set(IMPORT_TYPE "INTERFACE") + set(${NAME}_FOUND_LIBRARIES ${TPL_LIBRARIES}) + endif() + kokkos_create_imported_tpl( + ${TPL_IMPORTED_NAME} ${IMPORT_TYPE} - INCLUDES "${${NAME}_INCLUDE_DIRS}" - LIBRARY "${${NAME}_LIBRARY}" - LINK_LIBRARIES "${${NAME}_FOUND_LIBRARIES}") - ENDIF() -ENDMACRO(kokkos_find_imported) + INCLUDES + "${${NAME}_INCLUDE_DIRS}" + LIBRARY + "${${NAME}_LIBRARY}" + LINK_LIBRARIES + "${${NAME}_FOUND_LIBRARIES}" + ) + endif() +endmacro(kokkos_find_imported) # # @MACRO: KOKKOS_LINK_TPL() @@ -817,109 +773,114 @@ ENDMACRO(kokkos_find_imported) # If specified, this gives the exact name of the target to link against # target_link_libraries( ) # -FUNCTION(kokkos_link_tpl TARGET) - CMAKE_PARSE_ARGUMENTS(TPL - "PUBLIC;PRIVATE;INTERFACE" - "IMPORTED_NAME" - "" - ${ARGN}) +function(kokkos_link_tpl TARGET) + cmake_parse_arguments(TPL "PUBLIC;PRIVATE;INTERFACE" "IMPORTED_NAME" "" ${ARGN}) #the name of the TPL - SET(TPL ${TPL_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - #Do nothing, they will have already been linked - ELSE() - IF (NOT TPL_IMPORTED_NAME) - SET(TPL_IMPORTED_NAME Kokkos::${TPL}) - ENDIF() - IF (KOKKOS_ENABLE_${TPL}) - IF (TPL_PUBLIC) - TARGET_LINK_LIBRARIES(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_PRIVATE) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) - ELSEIF (TPL_INTERFACE) - TARGET_LINK_LIBRARIES(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) - ELSE() - TARGET_LINK_LIBRARIES(${TARGET} ${TPL_IMPORTED_NAME}) - ENDIF() - ENDIF() - ENDIF() -ENDFUNCTION() + set(TPL ${TPL_UNPARSED_ARGUMENTS}) + if(NOT TPL_IMPORTED_NAME) + set(TPL_IMPORTED_NAME Kokkos::${TPL}) + endif() + if(KOKKOS_ENABLE_${TPL}) + if(TPL_PUBLIC) + target_link_libraries(${TARGET} PUBLIC ${TPL_IMPORTED_NAME}) + elseif(TPL_PRIVATE) + target_link_libraries(${TARGET} PRIVATE ${TPL_IMPORTED_NAME}) + elseif(TPL_INTERFACE) + target_link_libraries(${TARGET} INTERFACE ${TPL_IMPORTED_NAME}) + else() + target_link_libraries(${TARGET} ${TPL_IMPORTED_NAME}) + endif() + endif() +endfunction() -FUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) - SET(COMPILERS NVIDIA NVHPC DEFAULT Cray Intel Clang AppleClang IntelLLVM GNU HIPCC Fujitsu MSVC) - CMAKE_PARSE_ARGUMENTS( - PARSE - "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" - "COMPILER_ID" - "${COMPILERS}" - ${ARGN}) - IF(PARSE_UNPARSED_ARGUMENTS) - MESSAGE(SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options") - ENDIF() +function(COMPILER_SPECIFIC_OPTIONS_HELPER) + set(COMPILERS + NVIDIA + NVHPC + DEFAULT + Cray + Intel + Clang + AppleClang + IntelLLVM + GNU + HIPCC + Fujitsu + MSVC + CrayClang + ) + cmake_parse_arguments( + PARSE "LINK_OPTIONS;COMPILE_OPTIONS;COMPILE_DEFINITIONS;LINK_LIBRARIES" "COMPILER_ID" "${COMPILERS}" ${ARGN} + ) + if(PARSE_UNPARSED_ARGUMENTS) + message( + SEND_ERROR "'${PARSE_UNPARSED_ARGUMENTS}' argument(s) not recognized when providing compiler specific options" + ) + endif() - IF(PARSE_COMPILER_ID) - SET(COMPILER ${${PARSE_COMPILER_ID}}) - ELSE() - SET(COMPILER ${KOKKOS_CXX_COMPILER_ID}) - ENDIF() + if(PARSE_COMPILER_ID) + set(COMPILER ${${PARSE_COMPILER_ID}}) + else() + set(COMPILER ${KOKKOS_CXX_COMPILER_ID}) + endif() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) - FOREACH(COMP ${COMPILERS}) - IF (COMPILER STREQUAL "${COMP}") - IF (PARSE_${COMPILER}) - IF ("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") - SET(COMPILER_SPECIFIC_FLAGS_TMP "") - ELSE() - SET(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) - ENDIF() - ENDIF() - ENDIF() - ENDFOREACH() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_DEFAULT}) + foreach(COMP ${COMPILERS}) + if(COMPILER STREQUAL "${COMP}") + if(PARSE_${COMPILER}) + if("${PARSE_${COMPILER}}" STREQUAL "NO-VALUE-SPECIFIED") + set(COMPILER_SPECIFIC_FLAGS_TMP "") + else() + set(COMPILER_SPECIFIC_FLAGS_TMP ${PARSE_${COMPILER}}) + endif() + endif() + endif() + endforeach() - IF (PARSE_COMPILE_OPTIONS) + if(PARSE_COMPILE_OPTIONS) # The funky logic here is for future handling of argument deduplication # If we naively pass multiple -Xcompiler flags to target_compile_options # -Xcompiler will get deduplicated and break the build - IF ("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) - LIST(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") - GLOBAL_APPEND(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ELSE() - GLOBAL_APPEND(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() - ENDIF() + if("-Xcompiler" IN_LIST COMPILER_SPECIFIC_FLAGS_TMP) + list(REMOVE_ITEM COMPILER_SPECIFIC_FLAGS_TMP "-Xcompiler") + global_append(KOKKOS_XCOMPILER_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + else() + global_append(KOKKOS_COMPILE_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() + endif() - IF (PARSE_LINK_OPTIONS) - GLOBAL_APPEND(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() + if(PARSE_LINK_OPTIONS) + global_append(KOKKOS_LINK_OPTIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() - IF (PARSE_COMPILE_DEFINITIONS) - GLOBAL_APPEND(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() + if(PARSE_COMPILE_DEFINITIONS) + global_append(KOKKOS_COMPILE_DEFINITIONS ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() - IF (PARSE_LINK_LIBRARIES) - GLOBAL_APPEND(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) - ENDIF() -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS_HELPER) + if(PARSE_LINK_LIBRARIES) + global_append(KOKKOS_LINK_LIBRARIES ${COMPILER_SPECIFIC_FLAGS_TMP}) + endif() +endfunction(COMPILER_SPECIFIC_OPTIONS_HELPER) -FUNCTION(COMPILER_SPECIFIC_FLAGS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_FLAGS) +function(COMPILER_SPECIFIC_FLAGS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_FLAGS) -FUNCTION(COMPILER_SPECIFIC_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_OPTIONS) +function(COMPILER_SPECIFIC_OPTIONS) + compiler_specific_options_helper(${ARGN} COMPILE_OPTIONS) +endfunction(COMPILER_SPECIFIC_OPTIONS) -FUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_OPTIONS) -ENDFUNCTION(COMPILER_SPECIFIC_LINK_OPTIONS) +function(COMPILER_SPECIFIC_LINK_OPTIONS) + compiler_specific_options_helper(${ARGN} LINK_OPTIONS) +endfunction(COMPILER_SPECIFIC_LINK_OPTIONS) -FUNCTION(COMPILER_SPECIFIC_DEFS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} COMPILE_DEFINITIONS) -ENDFUNCTION(COMPILER_SPECIFIC_DEFS) +function(COMPILER_SPECIFIC_DEFS) + compiler_specific_options_helper(${ARGN} COMPILE_DEFINITIONS) +endfunction(COMPILER_SPECIFIC_DEFS) -FUNCTION(COMPILER_SPECIFIC_LIBS) - COMPILER_SPECIFIC_OPTIONS_HELPER(${ARGN} LINK_LIBRARIES) -ENDFUNCTION(COMPILER_SPECIFIC_LIBS) +function(COMPILER_SPECIFIC_LIBS) + compiler_specific_options_helper(${ARGN} LINK_LIBRARIES) +endfunction(COMPILER_SPECIFIC_LIBS) # Given a list of the form # key1;value1;key2;value2,... # Create a list of all keys in a variable named ${KEY_LIST_NAME} @@ -927,41 +888,42 @@ ENDFUNCTION(COMPILER_SPECIFIC_LIBS) # kokkos_key_value_map(ARCH ALL_ARCHES key1;value1;key2;value2) # would produce a list variable ALL_ARCHES=key1;key2 # and individual variables ARCHkey1=value1 and ARCHkey2=value2 -MACRO(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) - SET(PARSE_KEY ON) - SET(${KEY_LIST_NAME}) - FOREACH(ENTRY ${ARGN}) - IF(PARSE_KEY) - SET(CURRENT_KEY ${ENTRY}) - SET(PARSE_KEY OFF) - LIST(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) - ELSE() - SET(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) - SET(PARSE_KEY ON) - ENDIF() - ENDFOREACH() -ENDMACRO() +macro(KOKKOS_KEY_VALUE_MAP VAR_PREFIX KEY_LIST_NAME) + set(PARSE_KEY ON) + set(${KEY_LIST_NAME}) + foreach(ENTRY ${ARGN}) + if(PARSE_KEY) + set(CURRENT_KEY ${ENTRY}) + set(PARSE_KEY OFF) + list(APPEND ${KEY_LIST_NAME} ${CURRENT_KEY}) + else() + set(${VAR_PREFIX}${CURRENT_KEY} ${ENTRY}) + set(PARSE_KEY ON) + endif() + endforeach() +endmacro() -FUNCTION(KOKKOS_CHECK_DEPRECATED_OPTIONS) - KOKKOS_KEY_VALUE_MAP(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) - FOREACH(OPTION_SUFFIX ${DEPRECATED_LIST}) - SET(OPTION_NAME Kokkos_${OPTION_SUFFIX}) - SET(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) - IF(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off - MESSAGE(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}") - ENDIF() - ENDFOREACH() -ENDFUNCTION() +function(KOKKOS_CHECK_DEPRECATED_OPTIONS) + kokkos_key_value_map(DEPRECATED_MSG_ DEPRECATED_LIST ${ARGN}) + foreach(OPTION_SUFFIX ${DEPRECATED_LIST}) + set(OPTION_NAME Kokkos_${OPTION_SUFFIX}) + set(OPTION_MESSAGE ${DEPRECATED_MSG_${OPTION_SUFFIX}}) + if(DEFINED ${OPTION_NAME}) # This variable has been given by the user as on or off + message(SEND_ERROR "Removed option ${OPTION_NAME} has been given with value ${${OPTION_NAME}}. ${OPT_MESSAGE}") + endif() + endforeach() +endfunction() # this function checks whether the current CXX compiler supports building CUDA -FUNCTION(kokkos_cxx_compiler_cuda_test _VAR) - # don't run this test every time - IF(DEFINED ${_VAR}) - RETURN() - ENDIF() +function(kokkos_cxx_compiler_cuda_test _VAR) + # don't run this test every time + if(DEFINED ${_VAR}) + return() + endif() - FILE(WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp -" + file( + WRITE ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp + " #include #include @@ -985,14 +947,13 @@ int main() cudaDeviceSynchronize(); return EXIT_SUCCESS; } -") +" + ) - TRY_COMPILE(_RET - ${PROJECT_BINARY_DIR}/compile_tests - SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) + try_compile(_RET ${PROJECT_BINARY_DIR}/compile_tests SOURCES ${PROJECT_BINARY_DIR}/compile_tests/compiles_cuda.cpp) - SET(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") -ENDFUNCTION() + set(${_VAR} ${_RET} CACHE STRING "CXX compiler supports building CUDA") +endfunction() # this function is provided to easily select which files use nvcc_wrapper: # @@ -1005,58 +966,77 @@ ENDFUNCTION() # NOTE: this is VERY DIFFERENT than the version in KokkosConfigCommon.cmake.in. # This version explicitly uses nvcc_wrapper. # -FUNCTION(kokkos_compilation) - # check whether the compiler already supports building CUDA - KOKKOS_CXX_COMPILER_CUDA_TEST(Kokkos_CXX_COMPILER_COMPILES_CUDA) - # if CUDA compile test has already been performed, just return - IF(Kokkos_CXX_COMPILER_COMPILES_CUDA) - RETURN() - ENDIF() +function(kokkos_compilation) + # check whether the compiler already supports building CUDA + kokkos_cxx_compiler_cuda_test(Kokkos_CXX_COMPILER_COMPILES_CUDA) + # if CUDA compile test has already been performed, just return + if(Kokkos_CXX_COMPILER_COMPILES_CUDA) + return() + endif() - CMAKE_PARSE_ARGUMENTS(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) + cmake_parse_arguments(COMP "GLOBAL;PROJECT" "" "DIRECTORY;TARGET;SOURCE" ${ARGN}) - # find kokkos_launch_compiler - FIND_PROGRAM(Kokkos_COMPILE_LAUNCHER - NAMES kokkos_launch_compiler - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + # find kokkos_launch_compiler + find_program( + Kokkos_COMPILE_LAUNCHER + NAMES kokkos_launch_compiler + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'") - ENDIF() + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR + "Kokkos could not find 'kokkos_launch_compiler'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/launcher'" + ) + endif() - # find nvcc_wrapper - FIND_PROGRAM(Kokkos_NVCC_WRAPPER - NAMES nvcc_wrapper - HINTS ${PROJECT_SOURCE_DIR} - PATHS ${PROJECT_SOURCE_DIR} - PATH_SUFFIXES bin) + # find nvcc_wrapper + find_program( + Kokkos_NVCC_WRAPPER + NAMES nvcc_wrapper + HINTS ${PROJECT_SOURCE_DIR} + PATHS ${PROJECT_SOURCE_DIR} + PATH_SUFFIXES bin + ) - IF(NOT Kokkos_COMPILE_LAUNCHER) - MESSAGE(FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'") - ENDIF() + if(NOT Kokkos_COMPILE_LAUNCHER) + message( + FATAL_ERROR "Kokkos could not find 'nvcc_wrapper'. Please set '-DKokkos_COMPILE_LAUNCHER=/path/to/nvcc_wrapper'" + ) + endif() - IF(COMP_GLOBAL) - # if global, don't bother setting others - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ELSE() - FOREACH(_TYPE PROJECT DIRECTORY TARGET SOURCE) - # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) - IF("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) - LIST(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) - UNSET(COMP_${_TYPE}) - ENDIF() - # set the properties if defined - IF(COMP_${_TYPE}) - # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - SET_PROPERTY(${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}") - ENDIF() - ENDFOREACH() - ENDIF() -ENDFUNCTION() + if(COMP_GLOBAL) + # if global, don't bother setting others + set_property( + GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + GLOBAL PROPERTY RULE_LAUNCH_LINK "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + else() + foreach(_TYPE PROJECT DIRECTORY TARGET SOURCE) + # make project/subproject scoping easy, e.g. KokkosCompilation(PROJECT) after project(...) + if("${_TYPE}" STREQUAL "PROJECT" AND COMP_${_TYPE}) + list(APPEND COMP_DIRECTORY ${PROJECT_SOURCE_DIR}) + unset(COMP_${_TYPE}) + endif() + # set the properties if defined + if(COMP_${_TYPE}) + # MESSAGE(STATUS "Using nvcc_wrapper :: ${_TYPE} :: ${COMP_${_TYPE}}") + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_COMPILE + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + set_property( + ${_TYPE} ${COMP_${_TYPE}} PROPERTY RULE_LAUNCH_LINK + "${Kokkos_COMPILE_LAUNCHER} ${Kokkos_NVCC_WRAPPER} ${CMAKE_CXX_COMPILER}" + ) + endif() + endforeach() + endif() +endfunction() ## KOKKOS_CONFIG_HEADER - parse the data list which is a list of backend names ## and create output config header file...used for ## creating dynamic include files based on enabled backends @@ -1066,14 +1046,15 @@ ENDFUNCTION() ## HEADER_GUARD TEXT used with include header guard ## HEADER_PREFIX prefix used with include (i.e. fwd, decl, setup) ## DATA_LIST list of backends to include in generated file -FUNCTION(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) - SET(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") - CONFIGURE_FILE(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) - FOREACH( BACKEND_NAME ${DATA_LIST} ) - SET(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> -\@INCLUDE_NEXT_FILE\@") - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) - ENDFOREACH() - SET(INCLUDE_NEXT_FILE "" ) - CONFIGURE_FILE(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) -ENDFUNCTION() +function(KOKKOS_CONFIG_HEADER SRC_FILE TARGET_FILE HEADER_GUARD HEADER_PREFIX DATA_LIST) + set(HEADER_GUARD_TAG "${HEADER_GUARD}_HPP_") + configure_file(cmake/${SRC_FILE} ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work COPYONLY) + foreach(BACKEND_NAME ${DATA_LIST}) + set(INCLUDE_NEXT_FILE "#include <${HEADER_PREFIX}_${BACKEND_NAME}.hpp> +\@INCLUDE_NEXT_FILE\@" + ) + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work @ONLY) + endforeach() + set(INCLUDE_NEXT_FILE "") + configure_file(${PROJECT_BINARY_DIR}/temp/${TARGET_FILE}.work ${TARGET_FILE} @ONLY) +endfunction() diff --git a/lib/kokkos/cmake/kokkos_install.cmake b/lib/kokkos/cmake/kokkos_install.cmake index f818dfa244..3ae7570ffe 100644 --- a/lib/kokkos/cmake/kokkos_install.cmake +++ b/lib/kokkos/cmake/kokkos_install.cmake @@ -1,57 +1,51 @@ -INCLUDE(CMakePackageConfigHelpers) -IF (NOT KOKKOS_HAS_TRILINOS AND NOT Kokkos_INSTALL_TESTING) - INCLUDE(GNUInstallDirs) +include(CMakePackageConfigHelpers) +if(NOT Kokkos_INSTALL_TESTING) + include(GNUInstallDirs) #Set all the variables needed for KokkosConfig.cmake - GET_PROPERTY(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) - SET(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) + get_property(KOKKOS_PROP_LIBS GLOBAL PROPERTY KOKKOS_LIBRARIES_NAMES) + set(KOKKOS_LIBRARIES ${KOKKOS_PROP_LIBS}) - INCLUDE(CMakePackageConfigHelpers) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfig.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + include(CMakePackageConfigHelpers) + configure_package_config_file( + cmake/KokkosConfig.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - CONFIGURE_PACKAGE_CONFIG_FILE( - cmake/KokkosConfigCommon.cmake.in - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake) + configure_package_config_file( + cmake/KokkosConfigCommon.cmake.in "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + INSTALL_DESTINATION ${CMAKE_INSTALL_FULL_LIBDIR}/cmake + ) - WRITE_BASIC_PACKAGE_VERSION_FILE("${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) # Install the KokkosConfig*.cmake files - install(FILES - "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" - "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" - DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) + install(FILES "${Kokkos_BINARY_DIR}/KokkosConfig.cmake" "${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake" + "${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake" DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos + ) install(EXPORT KokkosTargets NAMESPACE Kokkos:: DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/Kokkos) export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${Kokkos_BINARY_DIR}/KokkosTargets.cmake) # Required to be a TriBITS-compliant external package file(MAKE_DIRECTORY ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake - ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos) - export(EXPORT KokkosTargets NAMESPACE Kokkos:: FILE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake) -ELSE() - CONFIGURE_FILE(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake KOKKOS_CONFIG_COMMON) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_CONFIG_COMMON}") - CONFIGURE_FILE(cmake/KokkosTrilinosConfig.cmake.in ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake @ONLY) - file(READ ${Kokkos_BINARY_DIR}/KokkosTrilinosConfig.cmake KOKKOS_TRILINOS_CONFIG) - file(APPEND "${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/KokkosConfig_install.cmake" "${KOKKOS_TRILINOS_CONFIG}") + file(COPY ${Kokkos_BINARY_DIR}/KokkosConfig.cmake ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake + ${Kokkos_BINARY_DIR}/KokkosConfigVersion.cmake DESTINATION ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos + ) + file(WRITE ${CMAKE_BINARY_DIR}/cmake_packages/Kokkos/KokkosTargets.cmake + "include(${Kokkos_BINARY_DIR}/KokkosTargets.cmake)" + ) +else() + configure_file(cmake/KokkosConfigCommon.cmake.in ${Kokkos_BINARY_DIR}/KokkosConfigCommon.cmake @ONLY) - WRITE_BASIC_PACKAGE_VERSION_FILE("${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" - VERSION "${Kokkos_VERSION}" - COMPATIBILITY AnyNewerVersion) + write_basic_package_version_file( + "${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake" VERSION "${Kokkos_VERSION}" COMPATIBILITY AnyNewerVersion + ) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosConfigVersion.cmake - DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/${PACKAGE_NAME}") -ENDIF() - -INSTALL(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) + DESTINATION "${${PROJECT_NAME}_INSTALL_LIB_DIR}/cmake/Kokkos" + ) +endif() +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h DESTINATION ${KOKKOS_HEADER_DIR}) diff --git a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake index ae14a10d53..0d31e6d131 100644 --- a/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_pick_cxx_std.cmake @@ -1,20 +1,28 @@ # From CMake 3.10 documentation #This can run at any time -KOKKOS_OPTION(CXX_STANDARD "" STRING "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17") +kokkos_option( + CXX_STANDARD + "" + STRING + "[[DEPRECATED - USE CMAKE_CXX_STANDARD INSTEAD]] The C++ standard for Kokkos to use: 17 or 20. If empty, this will default to CMAKE_CXX_STANDARD. If both CMAKE_CXX_STANDARD and Kokkos_CXX_STANDARD are empty, this will default to 17" +) # Set CXX standard flags -SET(KOKKOS_ENABLE_CXX17 OFF) -SET(KOKKOS_ENABLE_CXX20 OFF) -SET(KOKKOS_ENABLE_CXX23 OFF) -SET(KOKKOS_ENABLE_CXX26 OFF) -IF (KOKKOS_CXX_STANDARD) - MESSAGE(FATAL_ERROR "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead") -ENDIF() +set(KOKKOS_ENABLE_CXX17 OFF) +set(KOKKOS_ENABLE_CXX20 OFF) +set(KOKKOS_ENABLE_CXX23 OFF) +set(KOKKOS_ENABLE_CXX26 OFF) +if(KOKKOS_CXX_STANDARD) + message( + FATAL_ERROR + "Setting the variable Kokkos_CXX_STANDARD in configuration is deprecated - set CMAKE_CXX_STANDARD directly instead" + ) +endif() -IF (NOT CMAKE_CXX_STANDARD) - SET(KOKKOS_CXX_STANDARD "17") -ELSE() - SET(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) -ENDIF() -MESSAGE(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") +if(NOT CMAKE_CXX_STANDARD) + set(KOKKOS_CXX_STANDARD "17") +else() + set(KOKKOS_CXX_STANDARD ${CMAKE_CXX_STANDARD}) +endif() +message(STATUS "Setting default Kokkos CXX standard to ${KOKKOS_CXX_STANDARD}") diff --git a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake index 5b45674e05..a84e714064 100644 --- a/lib/kokkos/cmake/kokkos_test_cxx_std.cmake +++ b/lib/kokkos/cmake/kokkos_test_cxx_std.cmake @@ -1,101 +1,112 @@ -KOKKOS_CFG_DEPENDS(CXX_STD COMPILER_ID) +kokkos_cfg_depends(CXX_STD COMPILER_ID) -FUNCTION(kokkos_set_cxx_standard_feature standard) - SET(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) - SET(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) - SET(FEATURE_NAME cxx_std_${standard}) +function(kokkos_set_cxx_standard_feature standard) + set(EXTENSION_NAME CMAKE_CXX${standard}_EXTENSION_COMPILE_OPTION) + set(STANDARD_NAME CMAKE_CXX${standard}_STANDARD_COMPILE_OPTION) + set(FEATURE_NAME cxx_std_${standard}) #CMake's way of telling us that the standard (or extension) #flags are supported is the extension/standard variables - IF (NOT DEFINED CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSEIF(CMAKE_CXX_EXTENSIONS) - IF(KOKKOS_DONT_ALLOW_EXTENSIONS) - MESSAGE(FATAL_ERROR "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue") - ELSE() - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS ON) - ENDIF() - ELSE() - #For trilinos, we need to make sure downstream projects - GLOBAL_SET(KOKKOS_USE_CXX_EXTENSIONS OFF) - ENDIF() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + global_set(KOKKOS_USE_CXX_EXTENSIONS OFF) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + elseif(CMAKE_CXX_EXTENSIONS) + if(KOKKOS_DONT_ALLOW_EXTENSIONS) + message( + FATAL_ERROR + "The chosen configuration does not support CXX extensions flags: ${KOKKOS_DONT_ALLOW_EXTENSIONS}. Must set CMAKE_CXX_EXTENSIONS=OFF to continue" + ) + else() + global_set(KOKKOS_USE_CXX_EXTENSIONS ON) + endif() + endif() - IF (KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) - MESSAGE(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) - MESSAGE(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") - IF (KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang)) - IF(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) - SET(SUPPORTED_NVCC_FLAGS "-std=c++17") - ELSE() - SET(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") - ENDIF() - IF (NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) - MESSAGE(FATAL_ERROR "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help.") - ENDIF() - ENDIF() - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") + if(KOKKOS_USE_CXX_EXTENSIONS AND ${EXTENSION_NAME}) + message(STATUS "Using ${${EXTENSION_NAME}} for C++${standard} extensions as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(NOT KOKKOS_USE_CXX_EXTENSIONS AND ${STANDARD_NAME}) + message(STATUS "Using ${${STANDARD_NAME}} for C++${standard} standard as feature") + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA AND (KOKKOS_CXX_HOST_COMPILER_ID STREQUAL GNU + OR KOKKOS_CXX_HOST_COMPILER_ID STREQUAL Clang) + ) + if(${KOKKOS_CXX_COMPILER_VERSION} VERSION_LESS 12.0.0) + set(SUPPORTED_NVCC_FLAGS "-std=c++17") + else() + set(SUPPORTED_NVCC_FLAGS "-std=c++17" "-std=c++20") + endif() + if(NOT ${${STANDARD_NAME}} IN_LIST SUPPORTED_NVCC_FLAGS) + message( + FATAL_ERROR + "CMake wants to use ${${STANDARD_NAME}} which is not supported by NVCC. Using a more recent host compiler or a more recent CMake version might help." + ) + endif() + endif() + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif(CMAKE_CXX_COMPILER_ID STREQUAL "MSVC" OR "x${CMAKE_CXX_SIMULATE_ID}" STREQUAL "xMSVC") #MSVC doesn't need a command line flag, that doesn't mean it has no support - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu")) - MESSAGE(STATUS "Using no flag for C++${standard} standard as feature") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ELSE() + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE ${FEATURE_NAME}) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32) + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL "Fujitsu") + message(STATUS "Using no flag for C++${standard} standard as feature") + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + else() #nope, we can't do anything here - MESSAGE(WARNING "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command.") - GLOBAL_SET(KOKKOS_CXX_STANDARD_FEATURE "") - ENDIF() + message( + WARNING + "C++${standard} is not supported as a compiler feature. We will choose custom flags for now, but this behavior has been deprecated. Please open an issue at https://github.com/kokkos/kokkos/issues reporting that ${KOKKOS_CXX_COMPILER_ID} ${KOKKOS_CXX_COMPILER_VERSION} failed for ${KOKKOS_CXX_STANDARD}, preferably including your CMake command." + ) + global_set(KOKKOS_CXX_STANDARD_FEATURE "") + endif() - IF((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) - IF(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) - MESSAGE(FATAL_ERROR "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported") - ENDIF() - ENDIF() -ENDFUNCTION() + if((NOT WIN32) AND (NOT ("${KOKKOS_CXX_COMPILER_ID}" STREQUAL "Fujitsu"))) + if(NOT ${FEATURE_NAME} IN_LIST CMAKE_CXX_COMPILE_FEATURES) + message( + FATAL_ERROR + "Compiler ${KOKKOS_CXX_COMPILER_ID} should support ${FEATURE_NAME}, but CMake reports feature not supported" + ) + endif() + endif() +endfunction() -IF(KOKKOS_CXX_STANDARD STREQUAL "17") +if(KOKKOS_CXX_STANDARD STREQUAL "17") kokkos_set_cxx_standard_feature(17) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") - SET(KOKKOS_ENABLE_CXX17 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "20") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "1Z") + set(KOKKOS_ENABLE_CXX17 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "20") kokkos_set_cxx_standard_feature(20) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") - SET(KOKKOS_ENABLE_CXX20 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "23") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2A") + set(KOKKOS_ENABLE_CXX20 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "23") kokkos_set_cxx_standard_feature(23) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") - SET(KOKKOS_ENABLE_CXX23 ON) -ELSEIF(KOKKOS_CXX_STANDARD STREQUAL "26") + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2B") + set(KOKKOS_ENABLE_CXX23 ON) +elseif(KOKKOS_CXX_STANDARD STREQUAL "26") kokkos_set_cxx_standard_feature(26) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") - SET(KOKKOS_ENABLE_CXX26 ON) -ELSE() - MESSAGE(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") -ENDIF() + set(KOKKOS_CXX_INTERMEDIATE_STANDARD "2C") + set(KOKKOS_ENABLE_CXX26 ON) +else() + message(FATAL_ERROR "Kokkos requires C++17 or newer but requested ${KOKKOS_CXX_STANDARD}!") +endif() # Enforce that we can compile a simple C++17 program -TRY_COMPILE(CAN_COMPILE_CPP17 - ${KOKKOS_TOP_BUILD_DIR}/corner_cases - ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp - OUTPUT_VARIABLE ERROR_MESSAGE - CXX_STANDARD 17 +try_compile( + CAN_COMPILE_CPP17 ${KOKKOS_TOP_BUILD_DIR}/corner_cases ${KOKKOS_SOURCE_DIR}/cmake/compile_tests/cplusplus17.cpp + OUTPUT_VARIABLE ERROR_MESSAGE CXX_STANDARD 17 ) -if (NOT CAN_COMPILE_CPP17) - UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - MESSAGE(FATAL_ERROR "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}") -ENDIF() -UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this - +if(NOT CAN_COMPILE_CPP17) + unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this + message( + FATAL_ERROR + "C++${KOKKOS_CXX_STANDARD}-compliant compiler detected, but unable to compile C++17 or later program. Verify that ${CMAKE_CXX_COMPILER_ID}:${CMAKE_CXX_COMPILER_VERSION} is set up correctly (e.g., check that correct library headers are being used).\nFailing output:\n ${ERROR_MESSAGE}" + ) +endif() +unset(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # Enforce that extensions are turned off for nvcc_wrapper. # For compiling CUDA code using nvcc_wrapper, we will use the host compiler's @@ -105,66 +116,70 @@ UNSET(CAN_COMPILE_CPP17 CACHE) #make sure CMake always re-runs this # that we can only use host compilers for CUDA builds that use those flags. # It also means that extensions (gnu++17) can't be turned on for CUDA builds. -IF(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() -ENDIF() +if(KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message(FATAL_ERROR "NVCC doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") + endif() +endif() -IF(KOKKOS_ENABLE_CUDA) +if(KOKKOS_ENABLE_CUDA) # ENFORCE that the compiler can compile CUDA code. - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - IF(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) - MESSAGE(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") - ENDIF() - IF(NOT DEFINED CMAKE_CXX_EXTENSIONS) - SET(CMAKE_CXX_EXTENSIONS OFF) - ELSEIF(CMAKE_CXX_EXTENSIONS) - MESSAGE(FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF") - ENDIF() - ELSEIF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) - MESSAGE(FATAL_ERROR "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}") - ENDIF() -ENDIF() + if(KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + if(KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 4.0.0) + message(FATAL_ERROR "Compiling CUDA code directly with Clang requires version 4.0.0 or higher.") + endif() + if(NOT DEFINED CMAKE_CXX_EXTENSIONS) + set(CMAKE_CXX_EXTENSIONS OFF) + elseif(CMAKE_CXX_EXTENSIONS) + message( + FATAL_ERROR "Compiling CUDA code with clang doesn't support C++ extensions. Set -DCMAKE_CXX_EXTENSIONS=OFF" + ) + endif() + elseif(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVIDIA) + message( + FATAL_ERROR + "Invalid compiler for CUDA. The compiler must be nvcc_wrapper or Clang or use kokkos_launch_compiler, but compiler ID was ${KOKKOS_CXX_COMPILER_ID}" + ) + endif() +endif() -IF (NOT KOKKOS_CXX_STANDARD_FEATURE) +if(NOT KOKKOS_CXX_STANDARD_FEATURE) #we need to pick the C++ flags ourselves - UNSET(CMAKE_CXX_STANDARD) - UNSET(CMAKE_CXX_STANDARD CACHE) - IF(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/cray.cmake) + unset(CMAKE_CXX_STANDARD) + unset(CMAKE_CXX_STANDARD CACHE) + if(KOKKOS_CXX_COMPILER_ID STREQUAL Cray) + include(${KOKKOS_SRC_PATH}/cmake/cray.cmake) kokkos_set_cray_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + include(${KOKKOS_SRC_PATH}/cmake/pgi.cmake) kokkos_set_pgi_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/intel.cmake) + elseif(KOKKOS_CXX_COMPILER_ID STREQUAL Intel) + include(${KOKKOS_SRC_PATH}/cmake/intel.cmake) kokkos_set_intel_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSEIF((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) + elseif((KOKKOS_CXX_COMPILER_ID STREQUAL "MSVC") OR ((KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA") AND WIN32)) + include(${KOKKOS_SRC_PATH}/cmake/msvc.cmake) kokkos_set_msvc_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) + else() + include(${KOKKOS_SRC_PATH}/cmake/gnu.cmake) kokkos_set_gnu_flags(${KOKKOS_CXX_STANDARD} ${KOKKOS_CXX_INTERMEDIATE_STANDARD}) - ENDIF() + endif() #check that the compiler accepts the C++ standard flag - INCLUDE(CheckCXXCompilerFlag) - IF (DEFINED CXX_STD_FLAGS_ACCEPTED) - UNSET(CXX_STD_FLAGS_ACCEPTED CACHE) - ENDIF() - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) - IF (NOT CXX_STD_FLAGS_ACCEPTED) - CHECK_CXX_COMPILER_FLAG("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) - IF (NOT CXX_INT_STD_FLAGS_ACCEPTED) - MESSAGE(FATAL_ERROR "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}") - ENDIF() - SET(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) - ENDIF() - MESSAGE(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") -ENDIF() - - - - + include(CheckCXXCompilerFlag) + if(DEFINED CXX_STD_FLAGS_ACCEPTED) + unset(CXX_STD_FLAGS_ACCEPTED CACHE) + endif() + check_cxx_compiler_flag("${KOKKOS_CXX_STANDARD_FLAG}" CXX_STD_FLAGS_ACCEPTED) + if(NOT CXX_STD_FLAGS_ACCEPTED) + check_cxx_compiler_flag("${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}" CXX_INT_STD_FLAGS_ACCEPTED) + if(NOT CXX_INT_STD_FLAGS_ACCEPTED) + message( + FATAL_ERROR + "${KOKKOS_CXX_COMPILER_ID} did not accept ${KOKKOS_CXX_STANDARD_FLAG} or ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}. You likely need to reduce the level of the C++ standard from ${KOKKOS_CXX_STANDARD}" + ) + endif() + set(KOKKOS_CXX_STANDARD_FLAG ${KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG}) + endif() + message(STATUS "Compiler features not supported, but ${KOKKOS_CXX_COMPILER_ID} accepts ${KOKKOS_CXX_STANDARD_FLAG}") +endif() diff --git a/lib/kokkos/cmake/kokkos_tpls.cmake b/lib/kokkos/cmake/kokkos_tpls.cmake index cda9e0d600..f43aff4d1f 100644 --- a/lib/kokkos/cmake/kokkos_tpls.cmake +++ b/lib/kokkos/cmake/kokkos_tpls.cmake @@ -1,126 +1,120 @@ -KOKKOS_CFG_DEPENDS(TPLS OPTIONS) -KOKKOS_CFG_DEPENDS(TPLS DEVICES) -KOKKOS_CFG_DEPENDS(TPLS COMPILER_ID) +kokkos_cfg_depends(TPLS OPTIONS) +kokkos_cfg_depends(TPLS DEVICES) +kokkos_cfg_depends(TPLS COMPILER_ID) -FUNCTION(KOKKOS_TPL_OPTION PKG DEFAULT) - CMAKE_PARSE_ARGUMENTS(PARSED - "" - "TRIBITS" - "" - ${ARGN}) +function(KOKKOS_TPL_OPTION PKG DEFAULT) + cmake_parse_arguments(PARSED "" "TRIBITS" "" ${ARGN}) - IF (PARSED_TRIBITS) + if(PARSED_TRIBITS) #this is also a TPL option you can activate with Tribits - IF (NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") + if(NOT "${TPL_ENABLE_${PARSED_TRIBITS}}" STREQUAL "") #Tribits brought its own default that should take precedence - SET(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) - ENDIF() - ENDIF() + set(DEFAULT ${TPL_ENABLE_${PARSED_TRIBITS}}) + endif() + endif() - KOKKOS_ENABLE_OPTION(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") - KOKKOS_OPTION(${PKG}_DIR "" PATH "Location of ${PKG} library") - SET(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) - SET(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) + kokkos_enable_option(${PKG} ${DEFAULT} "Whether to enable the ${PKG} library") + kokkos_option(${PKG}_DIR "" PATH "Location of ${PKG} library") + set(KOKKOS_ENABLE_${PKG} ${KOKKOS_ENABLE_${PKG}} PARENT_SCOPE) + set(KOKKOS_${PKG}_DIR ${KOKKOS_${PKG}_DIR} PARENT_SCOPE) +endfunction() - IF (KOKKOS_HAS_TRILINOS - AND KOKKOS_ENABLE_${PKG} - AND NOT PARSED_TRIBITS) - #this TPL was enabled, but it is not valid to use inside of TriBITS - MESSAGE(FATAL_ERROR "Enabled TPL ${PKG} inside TriBITS build, " - "but this can only be enabled in a standalone build") - ENDIF() -ENDFUNCTION() - -KOKKOS_TPL_OPTION(HWLOC Off TRIBITS HWLOC) -KOKKOS_TPL_OPTION(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC AND NOT - KOKKOS_HAS_TRILINOS) - SET(ROCM_DEFAULT ON) -ELSE() - SET(ROCM_DEFAULT OFF) -ENDIF() -IF(KOKKOS_ENABLE_HIP AND NOT KOKKOS_HAS_TRILINOS) - SET(ROCTHRUST_DEFAULT ON) -ELSE() - SET(ROCTHRUST_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ROCM ${ROCM_DEFAULT}) -KOKKOS_TPL_OPTION(ROCTHRUST ${ROCTHRUST_DEFAULT}) - -IF(KOKKOS_ENABLE_SYCL AND NOT KOKKOS_HAS_TRILINOS) - SET(ONEDPL_DEFAULT ON) -ELSE() - SET(ONEDPL_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(ONEDPL ${ONEDPL_DEFAULT}) - -IF (WIN32) - SET(LIBDL_DEFAULT Off) -ELSE() - SET(LIBDL_DEFAULT On) -ENDIF() -KOKKOS_TPL_OPTION(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) -SET(HPX_DEFAULT ON) -ELSE() -SET(HPX_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(HPX ${HPX_DEFAULT}) - -KOKKOS_TPL_OPTION(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) - -IF(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) - SET(LIBQUADMATH_DEFAULT ON) -ELSE() - SET(LIBQUADMATH_DEFAULT OFF) -ENDIF() -KOKKOS_TPL_OPTION(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) - -#Make sure we use our local FindKokkosCuda.cmake -KOKKOS_IMPORT_TPL(HPX INTERFACE) -KOKKOS_IMPORT_TPL(CUDA INTERFACE) -KOKKOS_IMPORT_TPL(HWLOC) -KOKKOS_IMPORT_TPL(LIBDL) -IF (NOT WIN32) - KOKKOS_IMPORT_TPL(THREADS INTERFACE) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_IMPORT_TPL(ROCM INTERFACE) -ENDIF() -KOKKOS_IMPORT_TPL(ONEDPL INTERFACE) -KOKKOS_IMPORT_TPL(LIBQUADMATH) -KOKKOS_IMPORT_TPL(ROCTHRUST) - -IF (Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) - find_package(desul REQUIRED COMPONENTS atomics) - KOKKOS_EXPORT_CMAKE_TPL(desul REQUIRED COMPONENTS atomics) -ENDIF() - -if (Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) - find_package(mdspan REQUIRED) - KOKKOS_EXPORT_CMAKE_TPL(mdspan REQUIRED) +kokkos_tpl_option(HWLOC Off TRIBITS HWLOC) +kokkos_tpl_option(CUDA ${Kokkos_ENABLE_CUDA} TRIBITS CUDA) +if(KOKKOS_ENABLE_HIP AND NOT KOKKOS_CXX_COMPILER_ID STREQUAL HIPCC) + set(ROCM_DEFAULT ON) +else() + set(ROCM_DEFAULT OFF) +endif() +if(KOKKOS_ENABLE_HIP) + set(ROCTHRUST_DEFAULT ON) +else() + set(ROCTHRUST_DEFAULT OFF) +endif() +kokkos_tpl_option(ROCM ${ROCM_DEFAULT}) +kokkos_tpl_option(ROCTHRUST ${ROCTHRUST_DEFAULT}) +if(Kokkos_ENABLE_ROCTHRUST) + include(CheckCXXSourceCompiles) + check_cxx_source_compiles( + " + #include + int main() { + static_assert(_GLIBCXX_RELEASE < 9); + return 0; + } + " + Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + ) endif() -IF (Kokkos_ENABLE_OPENMP) - find_package(OpenMP REQUIRED COMPONENTS CXX) - # FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency - # so we just append the flags here instead of linking with the OpenMP target. - IF(KOKKOS_HAS_TRILINOS) - COMPILER_SPECIFIC_FLAGS(DEFAULT ${OpenMP_CXX_FLAGS}) - ELSE() - KOKKOS_EXPORT_CMAKE_TPL(OpenMP REQUIRED COMPONENTS CXX) - ENDIF() - IF(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) - GLOBAL_APPEND(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) - ENDIF() - IF(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) - GLOBAL_APPEND(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) - ENDIF() -ENDIF() +if(KOKKOS_ENABLE_SYCL) + set(ONEDPL_DEFAULT ON) +else() + set(ONEDPL_DEFAULT OFF) +endif() +kokkos_tpl_option(ONEDPL ${ONEDPL_DEFAULT}) + +if(WIN32) + set(LIBDL_DEFAULT Off) +else() + set(LIBDL_DEFAULT On) +endif() +kokkos_tpl_option(LIBDL ${LIBDL_DEFAULT} TRIBITS DLlib) + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_HPX) + set(HPX_DEFAULT ON) +else() + set(HPX_DEFAULT OFF) +endif() +kokkos_tpl_option(HPX ${HPX_DEFAULT}) + +kokkos_tpl_option(THREADS ${Kokkos_ENABLE_THREADS} TRIBITS Pthread) + +if(Trilinos_ENABLE_Kokkos AND TPL_ENABLE_quadmath) + set(LIBQUADMATH_DEFAULT ON) +else() + set(LIBQUADMATH_DEFAULT OFF) +endif() +kokkos_tpl_option(LIBQUADMATH ${LIBQUADMATH_DEFAULT} TRIBITS quadmath) + +#Make sure we use our local FindKokkosCuda.cmake +kokkos_import_tpl(HPX INTERFACE) +kokkos_import_tpl(CUDA INTERFACE) +kokkos_import_tpl(HWLOC) +kokkos_import_tpl(LIBDL) +if(NOT WIN32) + kokkos_import_tpl(THREADS INTERFACE) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_import_tpl(ROCM INTERFACE) +endif() +kokkos_import_tpl(ONEDPL INTERFACE) +kokkos_import_tpl(LIBQUADMATH) +kokkos_import_tpl(ROCTHRUST) + +if(Kokkos_ENABLE_DESUL_ATOMICS_EXTERNAL) + find_package(desul REQUIRED COMPONENTS atomics) + kokkos_export_cmake_tpl(desul REQUIRED COMPONENTS atomics) +endif() + +if(Kokkos_ENABLE_IMPL_MDSPAN AND Kokkos_ENABLE_MDSPAN_EXTERNAL) + find_package(mdspan REQUIRED) + kokkos_export_cmake_tpl(mdspan REQUIRED) +endif() + +if(Kokkos_ENABLE_OPENMP) + find_package(OpenMP 3.0 REQUIRED COMPONENTS CXX) + kokkos_export_cmake_tpl(OpenMP REQUIRED COMPONENTS CXX) + if(Kokkos_ENABLE_HIP AND KOKKOS_COMPILE_LANGUAGE STREQUAL HIP) + global_append(KOKKOS_AMDGPU_OPTIONS ${OpenMP_CXX_FLAGS}) + endif() + if(Kokkos_ENABLE_CUDA AND KOKKOS_COMPILE_LANGUAGE STREQUAL CUDA) + global_append(KOKKOS_CUDA_OPTIONS -Xcompiler ${OpenMP_CXX_FLAGS}) + endif() +endif() #Convert list to newlines (which CMake doesn't always like in cache variables) -STRING(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") +string(REPLACE ";" "\n" KOKKOS_TPL_EXPORT_TEMP "${KOKKOS_TPL_EXPORTS}") #Convert to a regular variable -UNSET(KOKKOS_TPL_EXPORTS CACHE) -SET(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) +unset(KOKKOS_TPL_EXPORTS CACHE) +set(KOKKOS_TPL_EXPORTS ${KOKKOS_TPL_EXPORT_TEMP}) diff --git a/lib/kokkos/cmake/kokkos_tribits.cmake b/lib/kokkos/cmake/kokkos_tribits.cmake index 6da543a2c8..2fda803b11 100644 --- a/lib/kokkos/cmake/kokkos_tribits.cmake +++ b/lib/kokkos/cmake/kokkos_tribits.cmake @@ -1,82 +1,47 @@ #These are tribits wrappers only ever called by Kokkos itself -INCLUDE(CMakeParseArguments) -INCLUDE(CTest) -INCLUDE(GNUInstallDirs) +include(CMakeParseArguments) +include(CTest) +include(GNUInstallDirs) -MESSAGE(STATUS "The project name is: ${PROJECT_NAME}") +message(STATUS "The project name is: ${PROJECT_NAME}") -IF(GTest_FOUND) - SET(KOKKOS_GTEST_LIB GTest::gtest) - MESSAGE(STATUS "Using gtest found in ${GTest_DIR}") -ELSE() # fallback to internal gtest - SET(KOKKOS_GTEST_LIB kokkos_gtest) - MESSAGE(STATUS "Using internal gtest for testing") -ENDIF() +if(GTest_FOUND) + set(KOKKOS_GTEST_LIB GTest::gtest) + message(STATUS "Using gtest found in ${GTest_DIR}") +else() # fallback to internal gtest + set(KOKKOS_GTEST_LIB kokkos_gtest) + message(STATUS "Using internal gtest for testing") +endif() -FUNCTION(VERIFY_EMPTY CONTEXT) +function(VERIFY_EMPTY CONTEXT) if(${ARGN}) - MESSAGE(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") + message(FATAL_ERROR "Kokkos does not support all of Tribits. Unhandled arguments in ${CONTEXT}:\n${ARGN}") endif() -ENDFUNCTION() +endfunction() -#Leave this here for now - but only do for tribits -#This breaks the standalone CMake -IF (KOKKOS_HAS_TRILINOS) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_OpenMP) - SET(${PROJECT_NAME}_ENABLE_OpenMP OFF) - ENDIF() +macro(KOKKOS_PROCESS_SUBPACKAGES) + add_subdirectory(core) + add_subdirectory(containers) + add_subdirectory(algorithms) + add_subdirectory(simd) + add_subdirectory(example) + add_subdirectory(benchmarks) +endmacro() - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_HPX) - SET(${PROJECT_NAME}_ENABLE_HPX OFF) - ENDIF() +macro(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) + kokkos_lib_type(${LIBRARY_NAME} INCTYPE) + target_include_directories(${LIBRARY_NAME} ${INCTYPE} $) - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_DEBUG) - SET(${PROJECT_NAME}_ENABLE_DEBUG OFF) - ENDIF() - - IF(NOT DEFINED ${PROJECT_NAME}_ENABLE_TESTS) - SET(${PROJECT_NAME}_ENABLE_TESTS OFF) - ENDIF() - - IF(NOT DEFINED TPL_ENABLE_Pthread) - SET(TPL_ENABLE_Pthread OFF) - ENDIF() -ENDIF() - -MACRO(KOKKOS_PROCESS_SUBPACKAGES) - ADD_SUBDIRECTORY(core) - ADD_SUBDIRECTORY(containers) - ADD_SUBDIRECTORY(algorithms) - ADD_SUBDIRECTORY(simd) - if (NOT KOKKOS_HAS_TRILINOS) - ADD_SUBDIRECTORY(example) - ADD_SUBDIRECTORY(benchmarks) - endif() -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_DEF) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_DEF() - else() - #do nothing - endif() -ENDMACRO() - -MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) - KOKKOS_LIB_TYPE(${LIBRARY_NAME} INCTYPE) - TARGET_INCLUDE_DIRECTORIES(${LIBRARY_NAME} ${INCTYPE} $) - - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT ${PROJECT_NAME} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - COMPONENT ${PACKAGE_NAME} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} COMPONENT ${PACKAGE_NAME} ) - INSTALL( + install( TARGETS ${LIBRARY_NAME} EXPORT KokkosTargets RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} @@ -84,157 +49,131 @@ MACRO(KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL LIBRARY_NAME) ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) - VERIFY_EMPTY(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) -ENDMACRO() + verify_empty(KOKKOS_ADD_LIBRARY ${PARSE_UNPARSED_ARGUMENTS}) +endmacro() -FUNCTION(KOKKOS_ADD_EXECUTABLE ROOT_NAME) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXECUTABLE(${ROOT_NAME} ${ARGN}) - else() - CMAKE_PARSE_ARGUMENTS(PARSE - "TESTONLY" - "" - "SOURCES;TESTONLYLIBS" - ${ARGN}) +function(KOKKOS_ADD_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "TESTONLY" "" "SOURCES;TESTONLYLIBS" ${ARGN}) - SET_SOURCE_FILES_PROPERTIES(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + set_source_files_properties(${PARSE_SOURCES} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - ADD_EXECUTABLE(${EXE_NAME} ${PARSE_SOURCES}) - IF (PARSE_TESTONLYLIBS) - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) - ENDIF() - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) - #All executables must link to all the kokkos targets - #This is just private linkage because exe is final - TARGET_LINK_LIBRARIES(${EXE_NAME} PRIVATE Kokkos::kokkos) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + add_executable(${EXE_NAME} ${PARSE_SOURCES}) + if(PARSE_TESTONLYLIBS) + target_link_libraries(${EXE_NAME} PRIVATE ${PARSE_TESTONLYLIBS}) endif() -ENDFUNCTION() + verify_empty(KOKKOS_ADD_EXECUTABLE ${PARSE_UNPARSED_ARGUMENTS}) + #All executables must link to all the kokkos targets + #This is just private linkage because exe is final + target_link_libraries(${EXE_NAME} PRIVATE Kokkos::kokkos) +endfunction() -FUNCTION(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES;CATEGORIES;ARGS" - ${ARGN}) - VERIFY_EMPTY(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) +function(KOKKOS_ADD_EXECUTABLE_AND_TEST ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES;CATEGORIES;ARGS" ${ARGN}) + verify_empty(KOKKOS_ADD_EXECUTABLE_AND_TEST ${PARSE_UNPARSED_ARGUMENTS}) - IF (KOKKOS_HAS_TRILINOS) - IF(DEFINED PARSE_ARGS) - STRING(REPLACE ";" " " PARSE_ARGS "${PARSE_ARGS}") - ENDIF() - TRIBITS_ADD_EXECUTABLE_AND_TEST( - ${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - NUM_MPI_PROCS 1 - COMM serial mpi - ARGS ${PARSE_ARGS} - CATEGORIES ${PARSE_CATEGORIES} - SOURCES ${PARSE_SOURCES} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${PARSE_ARGS} - ) - ELSE() - KOKKOS_ADD_TEST_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ) - IF (PARSE_ARGS) - SET(TEST_NUMBER 0) - FOREACH (ARG_STR ${PARSE_ARGS}) - # This is passed as a single string blob to match TriBITS behavior - # We need this to be turned into a list - STRING(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) - LIST(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") - MATH(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") - KOKKOS_ADD_TEST(NAME ${TEST_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ARGS ${ARG_STR_LIST} - ) - ENDFOREACH() - ELSE() - KOKKOS_ADD_TEST(NAME ${ROOT_NAME} - EXE ${ROOT_NAME} - FAIL_REGULAR_EXPRESSION " FAILED " - ) - ENDIF() - ENDIF() - # We noticed problems with -fvisibility=hidden for inline static variables - # if Kokkos was built as shared library. - IF(BUILD_SHARED_LIBS) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) - SET_PROPERTY(TARGET ${PACKAGE_NAME}_${ROOT_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) - ENDIF() -ENDFUNCTION() + kokkos_add_test_executable(${ROOT_NAME} SOURCES ${PARSE_SOURCES}) + if(PARSE_ARGS) + set(TEST_NUMBER 0) + foreach(ARG_STR ${PARSE_ARGS}) + # This is passed as a single string blob to match TriBITS behavior + # We need this to be turned into a list + string(REPLACE " " ";" ARG_STR_LIST ${ARG_STR}) + list(APPEND TEST_NAME "${ROOT_NAME}${TEST_NUMBER}") + math(EXPR TEST_NUMBER "${TEST_NUMBER} + 1") + kokkos_add_test( + NAME + ${TEST_NAME} + EXE + ${ROOT_NAME} + FAIL_REGULAR_EXPRESSION + " FAILED " + ARGS + ${ARG_STR_LIST} + ) + endforeach() + else() + kokkos_add_test(NAME ${ROOT_NAME} EXE ${ROOT_NAME} FAIL_REGULAR_EXPRESSION " FAILED ") + endif() + # We noticed problems with -fvisibility=hidden for inline static variables + # if Kokkos was built as shared library. + if(BUILD_SHARED_LIBS AND NOT ${TEST_NAME}_DISABLE) + set_property(TARGET ${EXE_NAME} PROPERTY VISIBILITY_INLINES_HIDDEN ON) + set_property(TARGET ${EXE_NAME} PROPERTY CXX_VISIBILITY_PRESET hidden) + endif() + if(NOT + (Kokkos_INSTALL_TESTING + OR Kokkos_ENABLE_SYCL + OR Kokkos_ENABLE_HPX + OR Kokkos_ENABLE_IMPL_SKIP_NO_RTTI_FLAG + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "Intel" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 2021.2.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_COMPILER_VERSION VERSION_LESS 11.3.0) + OR (KOKKOS_CXX_COMPILER_ID STREQUAL "NVIDIA" AND KOKKOS_CXX_HOST_COMPILER_ID STREQUAL "MSVC")) + ) + if(MSVC) + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "/GR-") + else() + target_compile_options(${PACKAGE_NAME}_${ROOT_NAME} PRIVATE "-fno-rtti") + endif() + endif() +endfunction() -FUNCTION(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) - SET(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) - IF (NOT TARGET ${TARGET_NAME}) - MESSAGE(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") - ENDIF() - SET_PROPERTY(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) -ENDFUNCTION() +function(KOKKOS_SET_EXE_PROPERTY ROOT_NAME) + set(TARGET_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + if(NOT TARGET ${TARGET_NAME}) + message(SEND_ERROR "No target ${TARGET_NAME} exists - cannot set target properties") + endif() + set_property(TARGET ${TARGET_NAME} PROPERTY ${ARGN}) +endfunction() -MACRO(KOKKOS_SETUP_BUILD_ENVIRONMENT) +macro(KOKKOS_SETUP_BUILD_ENVIRONMENT) # This is needed for both regular build and install tests - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_compiler_id.cmake) #set an internal option, if not already set - SET(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") - IF (Kokkos_INSTALL_TESTING) - SET(KOKKOS_ENABLE_TESTS ON) - SET(KOKKOS_ENABLE_BENCHMARKS ON) - SET(KOKKOS_ENABLE_EXAMPLES ON) + set(Kokkos_INSTALL_TESTING OFF CACHE INTERNAL "Whether to build tests and examples against installation") + if(Kokkos_INSTALL_TESTING) + set(KOKKOS_ENABLE_TESTS ON) + set(KOKKOS_ENABLE_BENCHMARKS ON) + set(KOKKOS_ENABLE_EXAMPLES ON) # This looks a little weird, but what we are doing # is to NOT build Kokkos but instead look for an # installed Kokkos - then build examples and tests # against that installed Kokkos - FIND_PACKAGE(Kokkos REQUIRED) + find_package(Kokkos REQUIRED) # Just grab the configuration from the installation - FOREACH(DEV ${Kokkos_DEVICES}) - SET(KOKKOS_ENABLE_${DEV} ON) - ENDFOREACH() - FOREACH(OPT ${Kokkos_OPTIONS}) - SET(KOKKOS_ENABLE_${OPT} ON) - ENDFOREACH() - FOREACH(TPL ${Kokkos_TPLS}) - SET(KOKKOS_ENABLE_${TPL} ON) - ENDFOREACH() - FOREACH(ARCH ${Kokkos_ARCH}) - SET(KOKKOS_ARCH_${ARCH} ON) - ENDFOREACH() - ELSE() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) - IF (NOT KOKKOS_HAS_TRILINOS) - SET(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") - ENDIF() - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) - INCLUDE(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "" - "" - "SOURCES" - ${ARGN}) - KOKKOS_ADD_EXECUTABLE(${ROOT_NAME} - SOURCES ${PARSE_SOURCES} - ${PARSE_UNPARSED_ARGUMENTS} - TESTONLYLIBS ${KOKKOS_GTEST_LIB} - ) - SET(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) -ENDMACRO() - -MACRO(KOKKOS_PACKAGE_POSTPROCESS) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_PACKAGE_POSTPROCESS() + foreach(DEV ${Kokkos_DEVICES}) + set(KOKKOS_ENABLE_${DEV} ON) + endforeach() + foreach(OPT ${Kokkos_OPTIONS}) + set(KOKKOS_ENABLE_${OPT} ON) + endforeach() + foreach(TPL ${Kokkos_TPLS}) + set(KOKKOS_ENABLE_${TPL} ON) + endforeach() + foreach(ARCH ${Kokkos_ARCH}) + set(KOKKOS_ARCH_${ARCH} ON) + endforeach() + else() + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_devices.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_enable_options.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_test_cxx_std.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_arch.cmake) + set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${Kokkos_SOURCE_DIR}/cmake/Modules/") + include(${KOKKOS_SRC_PATH}/cmake/kokkos_tpls.cmake) + include(${KOKKOS_SRC_PATH}/cmake/kokkos_corner_cases.cmake) endif() -ENDMACRO() +endmacro() + +macro(KOKKOS_ADD_TEST_EXECUTABLE ROOT_NAME) + cmake_parse_arguments(PARSE "" "" "SOURCES" ${ARGN}) + # Don't do anything if the user disabled the test + if(NOT ${PACKAGE_NAME}_${ROOT_NAME}_DISABLE) + kokkos_add_executable( + ${ROOT_NAME} SOURCES ${PARSE_SOURCES} ${PARSE_UNPARSED_ARGUMENTS} TESTONLYLIBS ${KOKKOS_GTEST_LIB} + ) + set(EXE_NAME ${PACKAGE_NAME}_${ROOT_NAME}) + endif() +endmacro() ## KOKKOS_CONFIGURE_CORE Configure/Generate header files for core content based ## on enabled backends. @@ -242,265 +181,214 @@ ENDMACRO() ## KOKKOS_SETUP is included in Kokkos_Macros.hpp and include prefix includes/defines ## KOKKOS_DECLARE is the declaration set ## KOKKOS_POST_INCLUDE is included at the end of Kokkos_Core.hpp -MACRO(KOKKOS_CONFIGURE_CORE) - MESSAGE(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" "${KOKKOS_ENABLED_DEVICES}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" "${DEVICE_SETUP_LIST}") - KOKKOS_CONFIG_HEADER( KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" "${KOKKOS_ENABLED_DEVICES}") - CONFIGURE_FILE(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) -ENDMACRO() +macro(KOKKOS_CONFIGURE_CORE) + message(STATUS "Kokkos Backends: ${KOKKOS_ENABLED_DEVICES}") + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_FwdBackend.hpp "KOKKOS_FWD" "fwd/Kokkos_Fwd" + "${KOKKOS_ENABLED_DEVICES}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_SetupBackend.hpp "KOKKOS_SETUP" "setup/Kokkos_Setup" + "${DEVICE_SETUP_LIST}" + ) + kokkos_config_header( + KokkosCore_Config_HeaderSet.in KokkosCore_Config_DeclareBackend.hpp "KOKKOS_DECLARE" "decl/Kokkos_Declare" + "${KOKKOS_ENABLED_DEVICES}" + ) + configure_file(cmake/KokkosCore_config.h.in KokkosCore_config.h @ONLY) +endmacro() ## KOKKOS_INSTALL_ADDITIONAL_FILES - instruct cmake to install files in target destination. ## Includes generated header files, scripts such as nvcc_wrapper and hpcbind, ## as well as other files provided through plugins. -MACRO(KOKKOS_INSTALL_ADDITIONAL_FILES) +macro(KOKKOS_INSTALL_ADDITIONAL_FILES) # kokkos_launch_compiler is used by Kokkos to prefix compiler commands so that they forward to original kokkos compiler # if nvcc_wrapper was not used as CMAKE_CXX_COMPILER, configure the original compiler into kokkos_launch_compiler - IF(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") - ELSE() - IF(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") - SET(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") - ENDIF() - ENDIF() + if(NOT "${CMAKE_CXX_COMPILER}" MATCHES "nvcc_wrapper") + set(NVCC_WRAPPER_DEFAULT_COMPILER "${CMAKE_CXX_COMPILER}") + else() + if(NOT "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}" STREQUAL "") + set(NVCC_WRAPPER_DEFAULT_COMPILER "$ENV{NVCC_WRAPPER_DEFAULT_COMPILER}") + endif() + endif() - CONFIGURE_FILE(${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler - ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler - @ONLY) + configure_file( + ${CMAKE_CURRENT_SOURCE_DIR}/bin/kokkos_launch_compiler ${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler @ONLY + ) - INSTALL(PROGRAMS - "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" - "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" - "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" - DESTINATION ${CMAKE_INSTALL_BINDIR}) - INSTALL(FILES - "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" + install(PROGRAMS "${CMAKE_CURRENT_SOURCE_DIR}/bin/nvcc_wrapper" "${CMAKE_CURRENT_SOURCE_DIR}/bin/hpcbind" + "${PROJECT_BINARY_DIR}/temp/kokkos_launch_compiler" DESTINATION ${CMAKE_INSTALL_BINDIR} + ) + install( + FILES "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_config.h" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_FwdBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_SetupBackend.hpp" "${CMAKE_CURRENT_BINARY_DIR}/KokkosCore_Config_DeclareBackend.hpp" - DESTINATION ${KOKKOS_HEADER_DIR}) -ENDMACRO() + DESTINATION ${KOKKOS_HEADER_DIR} + ) +endmacro() +function(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) + cmake_parse_arguments(PARSE "PLAIN_STYLE" "" "" ${ARGN}) -FUNCTION(KOKKOS_SET_LIBRARY_PROPERTIES LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "PLAIN_STYLE" - "" - "" - ${ARGN}) - - IF((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) + if((NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL "3.18")) #I can use link options #check for CXX linkage using the simple 3.18 way - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_LINK_OPTIONS}> - ) - ELSE() + target_link_options(${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_LINK_OPTIONS}>) + else() #I can use link options #just assume CXX linkage - TARGET_LINK_OPTIONS( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS} - ) - ENDIF() + target_link_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_OPTIONS}) + endif() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_OPTIONS}> + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_OPTIONS}> ) - TARGET_COMPILE_DEFINITIONS( - ${LIBRARY_NAME} PUBLIC - $<$:${KOKKOS_COMPILE_DEFINITIONS}> + target_compile_definitions( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_COMPILE_DEFINITIONS}> ) - TARGET_LINK_LIBRARIES( - ${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES} - ) + target_link_libraries(${LIBRARY_NAME} PUBLIC ${KOKKOS_LINK_LIBRARIES}) - IF (KOKKOS_ENABLE_CUDA) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> + if(KOKKOS_ENABLE_CUDA) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_CUDA_OPTIONS}> ) - SET(NODEDUP_CUDAFE_OPTIONS) - FOREACH(OPT ${KOKKOS_CUDAFE_OPTIONS}) - LIST(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> + set(NODEDUP_CUDAFE_OPTIONS) + foreach(OPT ${KOKKOS_CUDAFE_OPTIONS}) + list(APPEND NODEDUP_CUDAFE_OPTIONS -Xcudafe ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_CUDAFE_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_ENABLE_HIP) - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> + if(KOKKOS_ENABLE_HIP) + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${KOKKOS_AMDGPU_OPTIONS}> ) - ENDIF() + endif() - LIST(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) - IF (XOPT_LENGTH GREATER 1) - MESSAGE(FATAL_ERROR "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12") - ENDIF() - IF(KOKKOS_XCOMPILER_OPTIONS) - SET(NODEDUP_XCOMPILER_OPTIONS) - FOREACH(OPT ${KOKKOS_XCOMPILER_OPTIONS}) + list(LENGTH KOKKOS_XCOMPILER_OPTIONS XOPT_LENGTH) + if(XOPT_LENGTH GREATER 1) + message( + FATAL_ERROR + "CMake deduplication does not allow multiple -Xcompiler flags (${KOKKOS_XCOMPILER_OPTIONS}): will require Kokkos to upgrade to minimum 3.12" + ) + endif() + if(KOKKOS_XCOMPILER_OPTIONS) + set(NODEDUP_XCOMPILER_OPTIONS) + foreach(OPT ${KOKKOS_XCOMPILER_OPTIONS}) #I have to do this for now because we can't guarantee 3.12 support #I really should do this with the shell option - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) - LIST(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) - ENDFOREACH() - TARGET_COMPILE_OPTIONS( - ${LIBRARY_NAME} - PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> + list(APPEND NODEDUP_XCOMPILER_OPTIONS -Xcompiler) + list(APPEND NODEDUP_XCOMPILER_OPTIONS ${OPT}) + endforeach() + target_compile_options( + ${LIBRARY_NAME} PUBLIC $<$:${NODEDUP_XCOMPILER_OPTIONS}> ) - ENDIF() + endif() - IF (KOKKOS_CXX_STANDARD_FEATURE) + if(KOKKOS_CXX_STANDARD_FEATURE) #GREAT! I can do this the right way - TARGET_COMPILE_FEATURES(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) - IF (NOT KOKKOS_USE_CXX_EXTENSIONS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) - ENDIF() - ELSE() + target_compile_features(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FEATURE}) + if(NOT KOKKOS_USE_CXX_EXTENSIONS) + set_target_properties(${LIBRARY_NAME} PROPERTIES CXX_EXTENSIONS OFF) + endif() + else() #OH, well, no choice but the wrong way - TARGET_COMPILE_OPTIONS(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) - ENDIF() -ENDFUNCTION() + target_compile_options(${LIBRARY_NAME} PUBLIC ${KOKKOS_CXX_STANDARD_FLAG}) + endif() +endfunction() +function(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "STATIC;SHARED" "" "HEADERS;SOURCES" ${ARGN}) -FUNCTION(KOKKOS_INTERNAL_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "STATIC;SHARED" - "" - "HEADERS;SOURCES" - ${ARGN}) - - IF(PARSE_HEADERS) - LIST(REMOVE_DUPLICATES PARSE_HEADERS) - ENDIF() - IF(PARSE_SOURCES) - LIST(REMOVE_DUPLICATES PARSE_SOURCES) - ENDIF() - FOREACH(source ${PARSE_SOURCES}) + if(PARSE_HEADERS) + list(REMOVE_DUPLICATES PARSE_HEADERS) + endif() + if(PARSE_SOURCES) + list(REMOVE_DUPLICATES PARSE_SOURCES) + endif() + foreach(source ${PARSE_SOURCES}) set_source_files_properties(${source} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) - ENDFOREACH() + endforeach() - IF(PARSE_STATIC) - SET(LINK_TYPE STATIC) - ENDIF() + if(PARSE_STATIC) + set(LINK_TYPE STATIC) + endif() - IF(PARSE_SHARED) - SET(LINK_TYPE SHARED) - ENDIF() + if(PARSE_SHARED) + set(LINK_TYPE SHARED) + endif() # MSVC and other platforms want to have # the headers included as source files # for better dependency detection - ADD_LIBRARY( - ${LIBRARY_NAME} - ${LINK_TYPE} - ${PARSE_HEADERS} - ${PARSE_SOURCES} - ) + add_library(${LIBRARY_NAME} ${LINK_TYPE} ${PARSE_HEADERS} ${PARSE_SOURCES}) - IF(PARSE_SHARED OR BUILD_SHARED_LIBS) - SET_TARGET_PROPERTIES(${LIBRARY_NAME} PROPERTIES - VERSION ${Kokkos_VERSION} - SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} + if(PARSE_SHARED OR BUILD_SHARED_LIBS) + set_target_properties( + ${LIBRARY_NAME} PROPERTIES VERSION ${Kokkos_VERSION} SOVERSION ${Kokkos_VERSION_MAJOR}.${Kokkos_VERSION_MINOR} ) - ENDIF() + endif() - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${LIBRARY_NAME}) + kokkos_internal_add_library_install(${LIBRARY_NAME}) #In case we are building in-tree, add an alias name #that matches the install Kokkos:: name - ADD_LIBRARY(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) -ENDFUNCTION() + add_library(Kokkos::${LIBRARY_NAME} ALIAS ${LIBRARY_NAME}) +endfunction() -FUNCTION(KOKKOS_ADD_LIBRARY LIBRARY_NAME) - CMAKE_PARSE_ARGUMENTS(PARSE - "ADD_BUILD_OPTIONS" - "" - "HEADERS" - ${ARGN} - ) - IF (KOKKOS_HAS_TRILINOS) - # We do not pass headers to trilinos. They would get installed - # to the default include folder, but we want headers installed - # preserving the directory structure, e.g. impl - # If headers got installed in both locations, it breaks some - # downstream packages - TRIBITS_ADD_LIBRARY(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} - ADDED_LIB_TARGET_NAME_OUT ${LIBRARY_NAME}_TARGET_NAME ) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${${LIBRARY_NAME}_TARGET_NAME}) - ENDIF() - ELSE() - # Forward the headers, we want to know about all headers - # to make sure they appear correctly in IDEs - KOKKOS_INTERNAL_ADD_LIBRARY( - ${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) - IF (PARSE_ADD_BUILD_OPTIONS) - KOKKOS_SET_LIBRARY_PROPERTIES(${LIBRARY_NAME}) - ENDIF() - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_ADD_INTERFACE_LIBRARY NAME) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_LIBRARY(${NAME} ${ARGN}) - ELSE() - ADD_LIBRARY(${NAME} INTERFACE) - KOKKOS_INTERNAL_ADD_LIBRARY_INSTALL(${NAME}) - ENDIF() -ENDFUNCTION() - - -FUNCTION(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - FOREACH(DIR ${ARGN}) - TARGET_INCLUDE_DIRECTORIES(${TARGET} ${INCTYPE} $) - ENDFOREACH() -ENDFUNCTION() - -FUNCTION(KOKKOS_LIB_COMPILE_OPTIONS TARGET) - KOKKOS_LIB_TYPE(${TARGET} INCTYPE) - KOKKOS_TARGET_COMPILE_OPTIONS(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) -ENDFUNCTION() - -MACRO(KOKKOS_ADD_TEST_DIRECTORIES) - IF (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_TEST_DIRECTORIES(${ARGN}) - ELSE() - IF(KOKKOS_ENABLE_TESTS) - FOREACH(TEST_DIR ${ARGN}) - ADD_SUBDIRECTORY(${TEST_DIR}) - ENDFOREACH() - ENDIF() - ENDIF() -ENDMACRO() - -MACRO(KOKKOS_ADD_EXAMPLE_DIRECTORIES) - if (KOKKOS_HAS_TRILINOS) - TRIBITS_ADD_EXAMPLE_DIRECTORIES(${ARGN}) - else() - IF(KOKKOS_ENABLE_EXAMPLES) - FOREACH(EXAMPLE_DIR ${ARGN}) - ADD_SUBDIRECTORY(${EXAMPLE_DIR}) - ENDFOREACH() - ENDIF() +function(KOKKOS_ADD_LIBRARY LIBRARY_NAME) + cmake_parse_arguments(PARSE "ADD_BUILD_OPTIONS" "" "HEADERS" ${ARGN}) + # Forward the headers, we want to know about all headers + # to make sure they appear correctly in IDEs + kokkos_internal_add_library(${LIBRARY_NAME} ${PARSE_UNPARSED_ARGUMENTS} HEADERS ${PARSE_HEADERS}) + if(PARSE_ADD_BUILD_OPTIONS) + kokkos_set_library_properties(${LIBRARY_NAME}) endif() -ENDMACRO() +endfunction() -MACRO(KOKKOS_ADD_BENCHMARK_DIRECTORIES) - IF(KOKKOS_ENABLE_BENCHMARKS) - FOREACH(BENCHMARK_DIR ${ARGN}) - ADD_SUBDIRECTORY(${BENCHMARK_DIR}) - ENDFOREACH() - ENDIF() -ENDMACRO() +function(KOKKOS_ADD_INTERFACE_LIBRARY NAME) + add_library(${NAME} INTERFACE) + kokkos_internal_add_library_install(${NAME}) +endfunction() + +function(KOKKOS_LIB_INCLUDE_DIRECTORIES TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + foreach(DIR ${ARGN}) + target_include_directories(${TARGET} ${INCTYPE} $) + endforeach() +endfunction() + +function(KOKKOS_LIB_COMPILE_OPTIONS TARGET) + kokkos_lib_type(${TARGET} INCTYPE) + target_compile_options(${${PROJECT_NAME}_LIBRARY_NAME_PREFIX}${TARGET} ${INCTYPE} ${ARGN}) +endfunction() + +macro(KOKKOS_ADD_TEST_DIRECTORIES) + if(KOKKOS_ENABLE_TESTS) + foreach(TEST_DIR ${ARGN}) + add_subdirectory(${TEST_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_EXAMPLE_DIRECTORIES) + if(KOKKOS_ENABLE_EXAMPLES) + foreach(EXAMPLE_DIR ${ARGN}) + add_subdirectory(${EXAMPLE_DIR}) + endforeach() + endif() +endmacro() + +macro(KOKKOS_ADD_BENCHMARK_DIRECTORIES) + if(KOKKOS_ENABLE_BENCHMARKS) + foreach(BENCHMARK_DIR ${ARGN}) + add_subdirectory(${BENCHMARK_DIR}) + endforeach() + endif() +endmacro() diff --git a/lib/kokkos/cmake/msvc.cmake b/lib/kokkos/cmake/msvc.cmake index 85421bdbaa..1de13585c7 100644 --- a/lib/kokkos/cmake/msvc.cmake +++ b/lib/kokkos/cmake/msvc.cmake @@ -1,11 +1,9 @@ - -FUNCTION(kokkos_set_msvc_flags full_standard int_standard) - IF (CMAKE_CXX_EXTENSIONS) - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ELSE() - SET(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) - ENDIF() -ENDFUNCTION() - +function(kokkos_set_msvc_flags full_standard int_standard) + if(CMAKE_CXX_EXTENSIONS) + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + else() + set(KOKKOS_CXX_STANDARD_FLAG "" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMEDIATE_STANDARD_FLAG "" PARENT_SCOPE) + endif() +endfunction() diff --git a/lib/kokkos/cmake/pgi.cmake b/lib/kokkos/cmake/pgi.cmake index e98e849558..45f59dcd10 100644 --- a/lib/kokkos/cmake/pgi.cmake +++ b/lib/kokkos/cmake/pgi.cmake @@ -1,8 +1,6 @@ - function(kokkos_set_pgi_flags full_standard int_standard) - STRING(TOLOWER ${full_standard} FULL_LC_STANDARD) - STRING(TOLOWER ${int_standard} INT_LC_STANDARD) - SET(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) - SET(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) + string(TOLOWER ${full_standard} FULL_LC_STANDARD) + string(TOLOWER ${int_standard} INT_LC_STANDARD) + set(KOKKOS_CXX_STANDARD_FLAG "--c++${FULL_LC_STANDARD}" PARENT_SCOPE) + set(KOKKOS_CXX_INTERMDIATE_STANDARD_FLAG "--c++${INT_LC_STANDARD}" PARENT_SCOPE) endfunction() - diff --git a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake index 4e05d22534..52d8368d04 100644 --- a/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLHWLOC.cmake @@ -15,7 +15,6 @@ # ************************************************************************ # @HEADER - #----------------------------------------------------------------------------- # Hardware locality detection and control library. # @@ -26,8 +25,4 @@ # Version: 1.3 # -KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( HWLOC - REQUIRED_HEADERS hwloc.h - REQUIRED_LIBS_NAMES "hwloc" - ) - +kokkos_tpl_find_include_dirs_and_libraries(HWLOC REQUIRED_HEADERS hwloc.h REQUIRED_LIBS_NAMES "hwloc") diff --git a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake index 3d5b03805d..f51bce5d64 100644 --- a/lib/kokkos/cmake/tpls/FindTPLPthread.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLPthread.cmake @@ -15,29 +15,26 @@ # ************************************************************************ # @HEADER -SET(USE_THREADS FALSE) +set(USE_THREADS FALSE) -IF(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) +if(NOT TPL_Pthread_INCLUDE_DIRS AND NOT TPL_Pthread_LIBRARY_DIRS AND NOT TPL_Pthread_LIBRARIES) # Use CMake's Thread finder since it is a bit smarter in determining # whether pthreads is already built into the compiler and doesn't need # a library to link. - FIND_PACKAGE(Threads) + find_package(Threads) #If Threads found a copy of pthreads make sure it is one of the cases the tribits #tpl system cannot handle. - IF(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) - IF(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") - SET(USE_THREADS TRUE) - ENDIF() - ENDIF() -ENDIF() + if(Threads_FOUND AND CMAKE_USE_PTHREADS_INIT) + if(CMAKE_THREAD_LIBS_INIT STREQUAL "" OR CMAKE_THREAD_LIBS_INIT STREQUAL "-pthread") + set(USE_THREADS TRUE) + endif() + endif() +endif() -IF(USE_THREADS) - SET(TPL_Pthread_INCLUDE_DIRS "") - SET(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") - SET(TPL_Pthread_LIBRARY_DIRS "") -ELSE() - KOKKOS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( Pthread - REQUIRED_HEADERS pthread.h - REQUIRED_LIBS_NAMES pthread - ) -ENDIF() +if(USE_THREADS) + set(TPL_Pthread_INCLUDE_DIRS "") + set(TPL_Pthread_LIBRARIES "${CMAKE_THREAD_LIBS_INIT}") + set(TPL_Pthread_LIBRARY_DIRS "") +else() + kokkos_tpl_find_include_dirs_and_libraries(Pthread REQUIRED_HEADERS pthread.h REQUIRED_LIBS_NAMES pthread) +endif() diff --git a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake index 8560ec60f1..b449f45135 100644 --- a/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake +++ b/lib/kokkos/cmake/tpls/FindTPLquadmath.cmake @@ -15,7 +15,4 @@ # ************************************************************************ # @HEADER -TRIBITS_TPL_FIND_INCLUDE_DIRS_AND_LIBRARIES( quadmath - REQUIRED_HEADERS quadmath.h - REQUIRED_LIBS_NAMES quadmath -) +tribits_tpl_find_include_dirs_and_libraries(quadmath REQUIRED_HEADERS quadmath.h REQUIRED_LIBS_NAMES quadmath) diff --git a/lib/kokkos/containers/CMakeLists.txt b/lib/kokkos/containers/CMakeLists.txt index 0857d7007b..8ee8bb41a2 100644 --- a/lib/kokkos/containers/CMakeLists.txt +++ b/lib/kokkos/containers/CMakeLists.txt @@ -1,9 +1,9 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() # FIXME_OPENACC: temporarily disabled due to unimplemented features -IF(NOT KOKKOS_ENABLE_OPENACC) -KOKKOS_ADD_TEST_DIRECTORIES(unit_tests) -KOKKOS_ADD_TEST_DIRECTORIES(performance_tests) -ENDIF() +if(NOT KOKKOS_ENABLE_OPENACC) + kokkos_add_test_directories(unit_tests) + kokkos_add_test_directories(performance_tests) +endif() diff --git a/lib/kokkos/containers/performance_tests/CMakeLists.txt b/lib/kokkos/containers/performance_tests/CMakeLists.txt index e325e45e85..8d4d605b08 100644 --- a/lib/kokkos/containers/performance_tests/CMakeLists.txt +++ b/lib/kokkos/containers/performance_tests/CMakeLists.txt @@ -1,7 +1,6 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) string(TOUPPER ${Tag} DEVICE) @@ -10,14 +9,8 @@ foreach(Tag Threads;OpenMP;Cuda;HPX;HIP) if(Kokkos_ENABLE_${DEVICE}) message(STATUS "Sources Test${Tag}.cpp") - set(SOURCES - TestMain.cpp - Test${Tag}.cpp - ) + set(SOURCES TestMain.cpp Test${Tag}.cpp) - KOKKOS_ADD_EXECUTABLE_AND_TEST( - ContainersPerformanceTest_${Tag} - SOURCES ${SOURCES} - ) + kokkos_add_executable_and_test(ContainersPerformanceTest_${Tag} SOURCES ${SOURCES}) endif() endforeach() diff --git a/lib/kokkos/containers/performance_tests/TestScatterView.hpp b/lib/kokkos/containers/performance_tests/TestScatterView.hpp index a74f833b9f..953b8bff6e 100644 --- a/lib/kokkos/containers/performance_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/performance_tests/TestScatterView.hpp @@ -25,8 +25,8 @@ namespace Perf { template void test_scatter_view(int m, int n) { - Kokkos::View original_view("original_view", - n); + Kokkos::View original_view("original_view", + n); { auto scatter_view = Kokkos::Experimental::create_scatter_view< Kokkos::Experimental::ScatterSum, Duplication, Contribution>( @@ -40,8 +40,8 @@ void test_scatter_view(int m, int n) { { auto num_threads = unique_token.size(); std::cout << "num_threads " << num_threads << '\n'; - Kokkos::View - hand_coded_duplicate_view("hand_coded_duplicate", num_threads, n); + Kokkos::View hand_coded_duplicate_view( + "hand_coded_duplicate", num_threads, n); auto f2 = KOKKOS_LAMBDA(int i) { auto thread_id = unique_token.acquire(); for (int j = 0; j < 10; ++j) { diff --git a/lib/kokkos/containers/src/CMakeLists.txt b/lib/kokkos/containers/src/CMakeLists.txt index b7d85ebf11..b386fbe675 100644 --- a/lib/kokkos/containers/src/CMakeLists.txt +++ b/lib/kokkos/containers/src/CMakeLists.txt @@ -1,33 +1,27 @@ #need these here for now -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}) #----------------------------------------------------------------------------- -SET(KOKKOS_CONTAINERS_SRCS) -APPEND_GLOB(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CONTAINER_HEADERS) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -APPEND_GLOB(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +set(KOKKOS_CONTAINERS_SRCS) +append_glob(KOKKOS_CONTAINERS_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CONTAINER_HEADERS) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +append_glob(KOKKOS_CONTAINERS_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) - -INSTALL ( +install( DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} - FILES_MATCHING PATTERN "*.hpp" + FILES_MATCHING + PATTERN "*.hpp" ) -KOKKOS_ADD_LIBRARY( - kokkoscontainers - SOURCES ${KOKKOS_CONTAINERS_SRCS} - HEADERS ${KOKKOS_CONTAINERS_HEADERS} -) +kokkos_add_library(kokkoscontainers SOURCES ${KOKKOS_CONTAINERS_SRCS} HEADERS ${KOKKOS_CONTAINERS_HEADERS}) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscontainers - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscontainers ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -KOKKOS_LINK_INTERNAL_LIBRARY(kokkoscontainers kokkoscore) +kokkos_link_internal_library(kokkoscontainers kokkoscore) #----------------------------------------------------------------------------- diff --git a/lib/kokkos/containers/src/Kokkos_Bitset.hpp b/lib/kokkos/containers/src/Kokkos_Bitset.hpp index f50ab0a0f7..409260f021 100644 --- a/lib/kokkos/containers/src/Kokkos_Bitset.hpp +++ b/lib/kokkos/containers/src/Kokkos_Bitset.hpp @@ -271,7 +271,7 @@ class Bitset { offset = !(scan_direction & BIT_SCAN_REVERSE) ? offset : (offset + block_mask) & block_mask; - block = Impl::rotate_right(block, offset); + block = Impl::rotate_right(block, offset); return (((!(scan_direction & BIT_SCAN_REVERSE) ? Impl::bit_scan_forward(block) : Impl::int_log2(block)) + diff --git a/lib/kokkos/containers/src/Kokkos_DualView.hpp b/lib/kokkos/containers/src/Kokkos_DualView.hpp index a37a2bdceb..6a2e6f73a1 100644 --- a/lib/kokkos/containers/src/Kokkos_DualView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DualView.hpp @@ -275,14 +275,29 @@ class DualView : public ViewTraits { const size_t n5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : modified_flags(t_modified_flags("DualView::modified_flags")), - d_view(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7) { - // without UVM, host View mirrors - if constexpr (Kokkos::Impl::has_type::value) - h_view = Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); - else - h_view = Kokkos::create_mirror_view(d_view); + : modified_flags(t_modified_flags("DualView::modified_flags")) { + if constexpr (Impl::ViewCtorProp::sequential_host_init) { + h_view = t_host(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + static_assert(Impl::ViewCtorProp::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!Impl::ViewCtorProp::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); + + d_view = Kokkos::create_mirror_view_and_copy( + typename traits::memory_space{}, h_view); + } else { + d_view = t_dev(arg_prop, n0, n1, n2, n3, n4, n5, n6, n7); + + // without UVM, host View mirrors + if constexpr (Kokkos::Impl::has_type::value) + h_view = + Kokkos::create_mirror_view(Kokkos::WithoutInitializing, d_view); + else + h_view = Kokkos::create_mirror_view(d_view); + } } //! Copy constructor (shallow copy) @@ -338,23 +353,21 @@ class DualView : public ViewTraits { // does the DualView have only one device struct impl_dualview_is_single_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; // does the given device match the device of t_dev? template struct impl_device_matches_tdev_device { - enum : bool { - value = std::is_same::value - }; + enum : bool { value = std::is_same_v }; }; // does the given device match the device of t_host? template struct impl_device_matches_thost_device { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -362,7 +375,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_thost_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -370,7 +383,7 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_exec { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -378,8 +391,8 @@ class DualView : public ViewTraits { template struct impl_device_matches_tdev_memory_space { enum : bool { - value = std::is_same::value + value = std::is_same_v }; }; @@ -389,11 +402,6 @@ class DualView : public ViewTraits { /// \brief Return a View on a specific device \c Device. /// - /// Please don't be afraid of the nested if_c expressions in the return - /// value's type. That just tells the method what the return type - /// should be: t_dev if the \c Device template parameter matches - /// this DualView's device type, else t_host. - /// /// For example, suppose you create a DualView on Cuda, like this: /// \code /// using dual_view_type = @@ -410,56 +418,47 @@ class DualView : public ViewTraits { /// typename dual_view_type::t_host hostView = DV.view (); /// \endcode template - KOKKOS_INLINE_FUNCTION const typename std::conditional_t< - impl_device_matches_tdev_device::value, t_dev, - typename std::conditional_t< - impl_device_matches_thost_device::value, t_host, - typename std::conditional_t< - impl_device_matches_thost_exec::value, t_host, - typename std::conditional_t< - impl_device_matches_tdev_exec::value, t_dev, - typename std::conditional_t< - impl_device_matches_tdev_memory_space::value, - t_dev, t_host>>>>> - view() const { - constexpr bool device_is_memspace = - std::is_same::value; - constexpr bool device_is_execspace = - std::is_same::value; - constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; - constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; - constexpr bool device_exec_is_t_host_exec = - std::is_same::value; - constexpr bool device_mem_is_t_host_mem = - std::is_same::value; - constexpr bool device_is_t_host_device = - std::is_same::value; - constexpr bool device_is_t_dev_device = - std::is_same::value; - - static_assert( - device_is_t_dev_device || device_is_t_host_device || - (device_is_memspace && - (device_mem_is_t_dev_mem || device_mem_is_t_host_mem)) || - (device_is_execspace && - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec)) || - ((!device_is_execspace && !device_is_memspace) && - ((device_mem_is_t_dev_mem || device_mem_is_t_host_mem) || - (device_exec_is_t_dev_exec || device_exec_is_t_host_exec))), - "Template parameter to .view() must exactly match one of the " - "DualView's device types or one of the execution or memory spaces"); - - return Impl::if_c::value, - t_dev, t_host>::select(d_view, h_view); + KOKKOS_FUNCTION auto view() const { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } else { + static_assert(std::is_same_v, + "The template argument is a memory space but doesn't " + "match either of DualView's memory spaces!"); + return h_view; + } + } else { + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + return d_view; + } else { + static_assert(std::is_same_v, + "The template argument is an execution space but " + "doesn't match either of DualView's execution spaces!"); + return h_view; + } + } else { + static_assert(std::is_same_v, + "The template argument is neither a memory space, " + "execution space, or device!"); + if constexpr (std::is_same_v) + return d_view; + else { + static_assert(std::is_same_v, + "The template argument is a device but " + "doesn't match either of DualView's devices!"); + return h_view; + } + } + } +#ifdef KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif } KOKKOS_INLINE_FUNCTION @@ -475,27 +474,27 @@ class DualView : public ViewTraits { template static int get_device_side() { constexpr bool device_is_memspace = - std::is_same::value; + std::is_same_v; constexpr bool device_is_execspace = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_dev_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_dev_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_exec_is_t_host_exec = - std::is_same::value; + std::is_same_v; constexpr bool device_mem_is_t_host_mem = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_host_device = - std::is_same::value; + std::is_same_v; constexpr bool device_is_t_dev_device = - std::is_same::value; + std::is_same_v; static_assert( device_is_t_dev_device || device_is_t_host_device || @@ -627,9 +626,9 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}); } @@ -637,9 +636,9 @@ class DualView : public ViewTraits { template void sync(const ExecutionSpace& exec, const std::enable_if_t< - (std::is_same::value) || - (std::is_same::value), + (std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::true_type{}, exec); } @@ -669,18 +668,18 @@ class DualView : public ViewTraits { template void sync(const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}); } template void sync(const ExecutionSpace& exec, const std::enable_if_t< - (!std::is_same::value) || - (std::is_same::value), + (!std::is_same_v) || + (std::is_same_v), int>& = 0) { sync_impl(std::false_type{}, exec); } @@ -943,12 +942,21 @@ class DualView : public ViewTraits { Impl::size_mismatch(h_view, h_view.rank_dynamic, new_extents); if (sizeMismatch) { - ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); - if constexpr (alloc_prop_input::initialize) { - h_view = create_mirror_view(typename t_host::memory_space(), d_view); + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + ::Kokkos::realloc(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); } else { - h_view = create_mirror_view(Kokkos::WithoutInitializing, - typename t_host::memory_space(), d_view); + ::Kokkos::realloc(arg_prop, d_view, n0, n1, n2, n3, n4, n5, n6, n7); + if constexpr (alloc_prop_input::initialize) { + h_view = create_mirror_view(typename t_host::memory_space(), d_view); + } else { + h_view = create_mirror_view(Kokkos::WithoutInitializing, + typename t_host::memory_space(), d_view); + } } } else if constexpr (alloc_prop_input::initialize) { if constexpr (alloc_prop_input::has_execution_space) { @@ -1062,9 +1070,22 @@ class DualView : public ViewTraits { } }; - constexpr bool has_execution_space = alloc_prop_input::has_execution_space; + if constexpr (alloc_prop_input::sequential_host_init) { + static_assert(alloc_prop_input::initialize, + "DualView: SequentialHostInit isn't compatible with " + "WithoutInitializing!"); + static_assert(!alloc_prop_input::has_execution_space, + "DualView: SequentialHostInit isn't compatible with " + "providing an execution space instance!"); - if constexpr (has_execution_space) { + if (sizeMismatch) { + sync(); + ::Kokkos::resize(arg_prop, h_view, n0, n1, n2, n3, n4, n5, n6, n7); + d_view = + create_mirror_view_and_copy(typename t_dev::memory_space(), h_view); + } + return; + } else if constexpr (alloc_prop_input::has_execution_space) { using ExecSpace = typename alloc_prop_input::execution_space; const auto& exec_space = Impl::get_property(arg_prop); @@ -1182,15 +1203,15 @@ class DualView : public ViewTraits { } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> extent(const iType& r) const { return d_view.extent(r); } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> extent_int(const iType& r) const { return static_cast(d_view.extent(r)); } diff --git a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp index 5f7fcaf69e..b860359526 100644 --- a/lib/kokkos/containers/src/Kokkos_DynRankView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynRankView.hpp @@ -38,6 +38,23 @@ class DynRankView; // forward declare namespace Impl { +template +struct ViewDataTypeFromRank { + using type = typename ViewDataTypeFromRank::type*; +}; + +template +struct ViewDataTypeFromRank { + using type = T; +}; + +template +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( + DynRankView v, + std::enable_if_t::specialize, + void>>* = nullptr); + template struct DynRankDimTraits { enum : size_t { unspecified = KOKKOS_INVALID_INDEX }; @@ -91,54 +108,59 @@ struct DynRankDimTraits { } // Create the layout for the rank-7 view. + // Because the underlying View is rank-7, preserve "unspecified" for + // dimension 8. + // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.dimension[7] != unspecified ? layout.dimension[7] : 1); + Layout new_layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified); + new_layout.stride = layout.stride; + return new_layout; } // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value), Layout> + (std::is_same_v), Layout> createLayout(const Layout& layout) { - return Layout(layout.dimension[0] != unspecified ? layout.dimension[0] : 1, - layout.stride[0], - layout.dimension[1] != unspecified ? layout.dimension[1] : 1, - layout.stride[1], - layout.dimension[2] != unspecified ? layout.dimension[2] : 1, - layout.stride[2], - layout.dimension[3] != unspecified ? layout.dimension[3] : 1, - layout.stride[3], - layout.dimension[4] != unspecified ? layout.dimension[4] : 1, - layout.stride[4], - layout.dimension[5] != unspecified ? layout.dimension[5] : 1, - layout.stride[5], - layout.dimension[6] != unspecified ? layout.dimension[6] : 1, - layout.stride[6], - layout.dimension[7] != unspecified ? layout.dimension[7] : 1, - layout.stride[7]); + return Layout( + layout.dimension[0] != unspecified ? layout.dimension[0] : 1, + layout.stride[0], + layout.dimension[1] != unspecified ? layout.dimension[1] : 1, + layout.stride[1], + layout.dimension[2] != unspecified ? layout.dimension[2] : 1, + layout.stride[2], + layout.dimension[3] != unspecified ? layout.dimension[3] : 1, + layout.stride[3], + layout.dimension[4] != unspecified ? layout.dimension[4] : 1, + layout.stride[4], + layout.dimension[5] != unspecified ? layout.dimension[5] : 1, + layout.stride[5], + layout.dimension[6] != unspecified ? layout.dimension[6] : 1, + layout.stride[6], + layout.dimension[7] != unspecified ? layout.dimension[7] : unspecified, + layout.stride[7]); } // Extra overload to match that for specialize types template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value || - std::is_same::value), + (std::is_same_v || + std::is_same_v || + std::is_same_v), typename Traits::array_layout> createLayout(const Kokkos::Impl::ViewCtorProp& /* prop */, const typename Traits::array_layout& layout) { @@ -164,9 +186,8 @@ struct DynRankDimTraits { // Non-strided Layout template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value || - std::is_same::value) && - std::is_integral::value, + (std::is_same_v || + std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -182,8 +203,7 @@ reconstructLayout(const Layout& layout, iType dynrank) { // LayoutStride template KOKKOS_INLINE_FUNCTION static std::enable_if_t< - (std::is_same::value) && - std::is_integral::value, + (std::is_same_v)&&std::is_integral_v, Layout> reconstructLayout(const Layout& layout, iType dynrank) { return Layout(dynrank > 0 ? layout.dimension[0] : KOKKOS_INVALID_INDEX, @@ -284,40 +304,43 @@ namespace Impl { template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && - std::is_void::value && - (std::is_same::value || - ((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value)))), - Kokkos::Impl::ViewToDynRankViewTag>> { + std::enable_if_t< + (std::is_same_v && + std::is_void_v && + std::is_void_v && + (std::is_same_v || + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>)))), + Kokkos::Impl::ViewToDynRankViewTag>> { private: enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { is_assignable_layout = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; public: @@ -345,7 +368,7 @@ class ViewMapping< src.layout()); // Check this for integer input1 for padding, etc dst.m_map.m_impl_handle = Kokkos::Impl::ViewDataHandle::assign( src.m_map.m_impl_handle, src.m_track.m_tracker); - dst.m_track.assign(src.m_track.m_tracker, DstTraits::is_managed); + dst.m_track.m_tracker.assign(src.m_track.m_tracker, DstTraits::is_managed); dst.m_rank = Kokkos::View::rank(); } }; @@ -378,10 +401,11 @@ struct is_dyn_rank_view> : public std::true_type { template inline constexpr bool is_dyn_rank_view_v = is_dyn_rank_view::value; +// Inherit privately from View, this way we don't import anything funky +// for example the rank member vs the rank() function of DynRankView template -class DynRankView : public ViewTraits { - static_assert(!std::is_array::value && - !std::is_pointer::value, +class DynRankView : private View { + static_assert(!std::is_array_v && !std::is_pointer_v, "Cannot template DynRankView with array or pointer datatype - " "must be pod"); @@ -391,28 +415,66 @@ class DynRankView : public ViewTraits { template friend class Kokkos::Impl::ViewMapping; + size_t m_rank{}; + public: using drvtraits = ViewTraits; using view_type = View; - using traits = ViewTraits; - private: - using map_type = - Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; - - track_type m_track; - map_type m_map; - unsigned m_rank; + using drdtraits = Impl::DynRankDimTraits; public: - KOKKOS_INLINE_FUNCTION + // typedefs from ViewTraits, overriden + using data_type = typename drvtraits::data_type; + using const_data_type = typename drvtraits::const_data_type; + using non_const_data_type = typename drvtraits::non_const_data_type; + + // typedefs from ViewTraits not overriden + using value_type = typename view_type::value_type; + using const_value_type = typename view_type::const_value_type; + using non_const_value_type = typename view_type::non_const_value_type; + using traits = typename view_type::traits; + using array_layout = typename view_type::array_layout; + + using execution_space = typename view_type::execution_space; + using memory_space = typename view_type::memory_space; + using device_type = typename view_type::device_type; + + using memory_traits = typename view_type::memory_traits; + using host_mirror_space = typename view_type::host_mirror_space; + using size_type = typename view_type::size_type; + + using reference_type = typename view_type::reference_type; + using pointer_type = typename view_type::pointer_type; + + using scalar_array_type = value_type; + using const_scalar_array_type = const_value_type; + using non_const_scalar_array_type = non_const_value_type; + using specialize = typename view_type::specialize; + + // typedefs in View for mdspan compatibility + // cause issues with MSVC+CUDA + // using layout_type = typename view_type::layout_type; + using index_type = typename view_type::index_type; + using element_type = typename view_type::element_type; + using rank_type = typename view_type::rank_type; + using reference = reference_type; + using data_handle_type = pointer_type; + + KOKKOS_FUNCTION view_type& DownCast() const { return (view_type&)(*this); } - KOKKOS_INLINE_FUNCTION + + // FIXME: this function make NO sense, the above one already is marked const + // Maybe one would want to get back a view of const?? + KOKKOS_FUNCTION const view_type& ConstDownCast() const { return (const view_type&)(*this); } + // FIXME: deprecate DownCast in favor of to_view + // KOKKOS_FUNCTION + // view_type to_view() const { return *this; } + // Types below - at least the HostMirror requires the value_type, NOT the rank // 7 data_type of the traits @@ -436,114 +498,36 @@ class DynRankView : public ViewTraits { typename drvtraits::array_layout, typename drvtraits::host_mirror_space>; + using host_mirror_type = HostMirror; //---------------------------------------- // Domain rank and extents // enum { Rank = map_type::Rank }; //Will be dyn rank of 7 always, keep the // enum? - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const { - return m_map.extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() const; - //---------------------------------------- /* Deprecate all 'dimension' functions in favor of * ISO/C++ vocabulary 'extent'. */ - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.extent(0) * m_map.extent(1) * m_map.extent(2) * - m_map.extent(3) * m_map.extent(4) * m_map.extent(5) * - m_map.extent(6) * m_map.extent(7); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. - - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return (m_map.data() != nullptr); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - //---------------------------------------- private: enum { is_layout_left = - std::is_same::value, + std::is_same_v, is_layout_right = - std::is_same::value, + std::is_same_v, - is_layout_stride = std::is_same::value, + is_layout_stride = + std::is_same_v, - is_default_map = std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride) + is_default_map = std::is_void_v && + (is_layout_left || is_layout_right || is_layout_stride), + + is_default_access = + is_default_map && std::is_same_v }; // Bounds checking macros @@ -570,476 +554,272 @@ class DynRankView : public ViewTraits { #endif public: - KOKKOS_INLINE_FUNCTION + KOKKOS_FUNCTION constexpr unsigned rank() const { return m_rank; } - // operators () - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type operator()() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); + using view_type::data; + using view_type::extent; + using view_type::extent_int; // FIXME: not tested + using view_type::impl_map; // FIXME: not tested + using view_type::is_allocated; + using view_type::label; + using view_type::size; + using view_type::span; + using view_type::span_is_contiguous; // FIXME: not tested + using view_type::stride; // FIXME: not tested + using view_type::stride_0; // FIXME: not tested + using view_type::stride_1; // FIXME: not tested + using view_type::stride_2; // FIXME: not tested + using view_type::stride_3; // FIXME: not tested + using view_type::stride_4; // FIXME: not tested + using view_type::stride_5; // FIXME: not tested + using view_type::stride_6; // FIXME: not tested + using view_type::stride_7; // FIXME: not tested + using view_type::use_count; + +#ifdef KOKKOS_ENABLE_CUDA + KOKKOS_FUNCTION reference_type + operator()(index_type i0 = 0, index_type i1 = 0, index_type i2 = 0, + index_type i3 = 0, index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); + } +#else + // Adding shortcut operators for rank-0 to rank-3 for default layouts + // and access modalities. + // This removes performance overhead for always using rank-7 mapping. + // See https://github.com/kokkos/kokkos/issues/7604 + // When boundschecking is enabled we still go through the underlying + // rank-7 View to leverage the error checks there. + + KOKKOS_FUNCTION reference_type operator()() const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 0u) + Kokkos::abort( + "DynRankView rank 0 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + return view_type::data()[0]; + } else +#endif + return view_type::operator()(0, 0, 0, 0, 0, 0, 0); } - // Rank 1 - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding...) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // Phalanx is violating this, since they use the operator to access ALL - // elements in the allocation KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( (1 , - // this->rank(), m_track, m_map) ) - return data()[i0]; + KOKKOS_FUNCTION reference_type operator()(index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) allows mismatch of rank and + // index args + if (rank() > 1u) + Kokkos::abort( + "DynRankView rank 1 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_stride) { + return view_type::data()[i0 * view_type::stride(0)]; + } else { + return view_type::data()[i0]; + } + } else +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } - // This assumes a contiguous underlying memory (i.e. no padding, no - // striding... AND a Trilinos/Sacado scalar type ) - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - !std::is_same::value && - std::is_integral::value, - reference_type> - operator[](const iType& i0) const { - // auto map = impl_map(); - const size_t dim_scalar = m_map.dimension_scalar(); - const size_t bytes = this->span() / dim_scalar; - - using tmp_view_type = Kokkos::View< - DataType*, typename traits::array_layout, typename traits::device_type, - Kokkos::MemoryTraits>; - tmp_view_type rankone_view(this->data(), bytes, dim_scalar); - return rankone_view(i0); + KOKKOS_FUNCTION reference_type operator()(index_type i0, + index_type i1) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) allows mismatch of rank and + // index args + if (rank() > 2u) + Kokkos::abort( + "DynRankView rank 2 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_left) { + return view_type::data()[i0 + i1 * view_type::stride(1)]; + } else if constexpr (is_layout_right) { + return view_type::data()[i0 * view_type::extent(1) + i1]; + } else { + return view_type::data()[i0 * view_type::stride(0) + + i1 * view_type::stride(1)]; + } + } else +#endif + return view_type::operator()(i0, i1, 0, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); + KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, + index_type i2) const { +#ifdef KOKKOS_ENABLE_DEBUG + // FIXME: Should be equal, only access(...) allows mismatch of rank and + // index args + if (rank() > 3u) + Kokkos::abort( + "DynRankView rank 3 operator() called with invalid number of " + "arguments."); +#endif +#ifndef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (is_default_access) { + if constexpr (is_layout_left) { + return view_type::data()[i0 + view_type::stride(1) * + (i1 + i2 * view_type::extent(1))]; + } else if constexpr (is_layout_right) { + return view_type::data()[(i0 * view_type::extent(1) + i1) * + view_type::extent(2) + + i2]; + } else { + return view_type::data()[i0 * view_type::stride(0) + + i1 * view_type::stride(1) + + i2 * view_type::stride(2)]; + } + } else +#endif + return view_type::operator()(i0, i1, i2, 0, 0, 0, 0); +#if defined KOKKOS_COMPILER_INTEL || \ + (defined(KOKKOS_COMPILER_NVCC) && KOKKOS_COMPILER_NVCC >= 1130 && \ + !defined(KOKKOS_COMPILER_MSVC)) + __builtin_unreachable(); +#endif } - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); + KOKKOS_FUNCTION reference_type operator()(index_type i0, index_type i1, + index_type i2, index_type i3, + index_type i4 = 0, + index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } +#endif - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); +// This is an accomodation for Phalanx, that is usint the operator[] to access +// all elements in a linear fashion even when the rank is not 1 +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { + if constexpr (std::is_same_v) { + return view_type::data()[i0]; + } else { + const size_t dim_scalar = view_type::impl_map().dimension_scalar(); + const size_t bytes = view_type::span() / dim_scalar; + + using tmp_view_type = + Kokkos::View>; + tmp_view_type rankone_view(view_type::data(), bytes, dim_scalar); + return rankone_view(i0); + } } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); +#else + KOKKOS_FUNCTION reference_type operator[](index_type i0) const { +#ifdef KOKKOS_ENABLE_DEBUG + if (rank() != 1u) + Kokkos::abort("DynRankView operator[] can only be used for rank-1"); +#endif + return view_type::operator()(i0, 0, 0, 0, 0, 0, 0); } +#endif - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); + KOKKOS_FUNCTION reference_type access(index_type i0 = 0, index_type i1 = 0, + index_type i2 = 0, index_type i3 = 0, + index_type i4 = 0, index_type i5 = 0, + index_type i6 = 0) const { + return view_type::operator()(i0, i1, i2, i3, i4, i5, i6); } - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - operator()(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5, - const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - // Rank 0 - KOKKOS_INLINE_FUNCTION - reference_type access() const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((0, this->rank(), m_track, m_map)) - return impl_map().reference(); - // return m_map.reference(0,0,0,0,0,0,0); - } - - // Rank 1 - // Rank 1 parenthesis - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t<(std::is_void::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType& i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((1, this->rank(), m_track, m_map, i0)) - return m_map.reference(i0, 0, 0, 0, 0, 0, 0); - } - - // Rank 2 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY((2, this->rank(), m_track, m_map, i0, i1)) - return m_map.reference(i0, i1, 0, 0, 0, 0, 0); - } - - // Rank 3 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (3, this->rank(), m_track, m_map, i0, i1, i2)) - return m_map.reference(i0, i1, i2, 0, 0, 0, 0); - } - - // Rank 4 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (4, this->rank(), m_track, m_map, i0, i1, i2, i3)) - return m_map.reference(i0, i1, i2, i3, 0, 0, 0); - } - - // Rank 5 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (5, this->rank(), m_track, m_map, i0, i1, i2, i3, i4)) - return m_map.reference(i0, i1, i2, i3, i4, 0, 0); - } - - // Rank 6 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_void::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, - const iType3& i3, const iType4& i4, const iType5& i5) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (6, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5)) - return m_map.reference(i0, i1, i2, i3, i4, i5, 0); - } - - // Rank 7 - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - (std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value && std::is_integral::value && - std::is_integral::value), - reference_type> - access(const iType0& i0, const iType1& i1, const iType2& i2, const iType3& i3, - const iType4& i4, const iType5& i5, const iType6& i6) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY( - (7, this->rank(), m_track, m_map, i0, i1, i2, i3, i4, i5, i6)) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - //---------------------------------------- // Standard constructor, destructor, and assignment operators... KOKKOS_DEFAULTED_FUNCTION ~DynRankView() = default; - KOKKOS_INLINE_FUNCTION - DynRankView() : m_track(), m_map(), m_rank() {} // Default ctor - - KOKKOS_INLINE_FUNCTION - DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} - - KOKKOS_INLINE_FUNCTION - DynRankView(DynRankView&& rhs) - : m_track(rhs.m_track), m_map(rhs.m_map), m_rank(rhs.m_rank) {} - - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(const DynRankView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; - } - - KOKKOS_INLINE_FUNCTION - DynRankView& operator=(DynRankView&& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_rank = rhs.m_rank; - return *this; - } + KOKKOS_DEFAULTED_FUNCTION DynRankView() = default; //---------------------------------------- // Compatible view copy constructor and assignment // may assign unmanaged from managed. + // Make this conditionally explicit? template - KOKKOS_INLINE_FUNCTION DynRankView(const DynRankView& rhs) - : m_track(rhs.m_track, traits::is_managed), m_map(), m_rank(rhs.m_rank) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - } + KOKKOS_FUNCTION DynRankView(const DynRankView& rhs) + : view_type(rhs), m_rank(rhs.m_rank) {} template - KOKKOS_INLINE_FUNCTION DynRankView& operator=( - const DynRankView& rhs) { - using SrcTraits = typename DynRankView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible DynRankView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); - m_track.assign(rhs.m_track, traits::is_managed); - m_rank = rhs.rank(); + KOKKOS_FUNCTION DynRankView& operator=(const DynRankView& rhs) { + view_type::operator=(rhs); + m_rank = rhs.m_rank; return *this; } +#if 0 // TODO: this will later be swapped in depending on whether the new View + // impl is active + private: + template + KOKKOS_FUNCTION typename view_type::extents_type create_rank7_extents( + const Ext& ext) { + return typename view_type::extents_type( + ext.rank() > 0 ? ext.extent(0) : 1, ext.rank() > 1 ? ext.extent(1) : 1, + ext.rank() > 2 ? ext.extent(2) : 1, ext.rank() > 3 ? ext.extent(3) : 1, + ext.rank() > 4 ? ext.extent(4) : 1, ext.rank() > 5 ? ext.extent(5) : 1, + ext.rank() > 6 ? ext.extent(6) : 1); + } + + public: // Copy/Assign View to DynRankView template - KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs) - : m_track(), m_map(), m_rank(View::rank()) { + KOKKOS_INLINE_FUNCTION DynRankView(const View& rhs, + size_t new_rank) + : view_type(rhs.data_handle(), drdtraits::createLayout(rhs.layout())), + m_rank(new_rank) { + if (new_rank > rhs.rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); + } + + template + KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + view_type::operator=(view_type( + rhs.data_handle(), + typename view_type::mapping_type(create_rank7_extents(rhs.extents())), + rhs.accessor())); + m_rank = rhs.rank(); + return *this; + } +#else + template + KOKKOS_FUNCTION DynRankView(const View& rhs, size_t new_rank) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping; static_assert(Mapping::is_assignable, - "Incompatible View to DynRankView copy construction"); + "Incompatible View to DynRankView copy assignment"); + if (new_rank > View::rank()) + Kokkos::abort( + "Attempting to construct DynRankView from View and new rank, with " + "the new rank being too large."); Mapping::assign(*this, rhs); + m_rank = new_rank; } template - KOKKOS_INLINE_FUNCTION DynRankView& operator=(const View& rhs) { + KOKKOS_FUNCTION DynRankView& operator=(const View& rhs) { using SrcTraits = typename View::traits; using Mapping = Kokkos::Impl::ViewMapping { static_assert(Mapping::is_assignable, "Incompatible View to DynRankView copy assignment"); Mapping::assign(*this, rhs); + m_rank = View::rank(); return *this; } +#endif + + template + KOKKOS_FUNCTION DynRankView(const View& rhs) + : DynRankView(rhs, View::rank()) {} //---------------------------------------- // Allocation tracking properties - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.use_count(); } - - inline const std::string label() const { - return m_track.template get_label(); - } - //---------------------------------------- // Allocation according to allocation properties and array layout // unused arg_layout dimensions must be set to KOKKOS_INVALID_INDEX so that // rank deduction can properly take place + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( - const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), - m_map(), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception( - "Constructing DynRankView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout), - Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - } - - // Wrappers - template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, - Impl::DynRankDimTraits:: - template createLayout(arg_prop, arg_layout)), - m_rank(Impl::DynRankDimTraits:: - template computeRank( - arg_prop, arg_layout)) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing DynRankView to wrap user memory must supply matching " - "pointer type"); - } + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} + + template + explicit DynRankView( + const Kokkos::Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout const&> + arg_layout) + : view_type(arg_prop, drdtraits::template createLayout( + arg_prop, arg_layout)), + m_rank(drdtraits::computeRank(arg_prop, arg_layout)) {} //---------------------------------------- // Constructor(s) // Simple dimension-only layout + // We need two variants to avoid calling host function from host device + // function warnings template - explicit inline DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} template - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit DynRankView( const Kokkos::Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - size_t> const arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) + std::enable_if_t::has_pointer, + const size_t> + arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) : DynRankView(arg_prop, typename traits::array_layout( arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)) {} @@ -1188,16 +934,20 @@ class DynRankView : public ViewTraits { //---------------------------------------- // Memory span required to wrap these dimensions. + // FIXME: this function needs to be tested static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, + [[maybe_unused]] const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + // FIXME: check that arg_N7 is not set by user (in debug mode) + return view_type::required_allocation_size(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_INVALID_INDEX, + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, const size_t arg_N2 = KOKKOS_INVALID_INDEX, const size_t arg_N3 = KOKKOS_INVALID_INDEX, @@ -1205,55 +955,38 @@ class DynRankView : public ViewTraits { const size_t arg_N5 = KOKKOS_INVALID_INDEX, const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), arg_N0, - arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( - pointer_type arg_ptr, typename traits::array_layout& arg_layout) - : DynRankView(Kokkos::Impl::ViewCtorProp(arg_ptr), - arg_layout) {} + explicit KOKKOS_FUNCTION DynRankView( + typename view_type::pointer_type arg_ptr, + typename traits::array_layout& arg_layout) + : DynRankView( + Kokkos::Impl::ViewCtorProp( + arg_ptr), + arg_layout) {} //---------------------------------------- // Shared scratch memory constructor - static inline size_t shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - const size_t num_passed_args = - (arg_N0 != KOKKOS_INVALID_INDEX) + (arg_N1 != KOKKOS_INVALID_INDEX) + - (arg_N2 != KOKKOS_INVALID_INDEX) + (arg_N3 != KOKKOS_INVALID_INDEX) + - (arg_N4 != KOKKOS_INVALID_INDEX) + (arg_N5 != KOKKOS_INVALID_INDEX) + - (arg_N6 != KOKKOS_INVALID_INDEX) + (arg_N7 != KOKKOS_INVALID_INDEX); - - if (std::is_void::value && - num_passed_args != traits::rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - {} - - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + // Note: We must pass 7 valid args since view_type is rank 7 + static inline size_t shmem_size( + const size_t arg_N0 = 1, const size_t arg_N1 = 1, const size_t arg_N2 = 1, + const size_t arg_N3 = 1, const size_t arg_N4 = 1, const size_t arg_N5 = 1, + const size_t arg_N6 = 1, const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + return view_type::shmem_size(arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, + arg_N6, arg_N7); } - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const typename traits::array_layout& arg_layout) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(arg_layout) // is this correct? - )))), - arg_layout) {} + : view_type(arg_space, drdtraits::createLayout(arg_layout)), + m_rank(drdtraits::computeRank(arg_layout)) {} - explicit KOKKOS_INLINE_FUNCTION DynRankView( + explicit KOKKOS_FUNCTION DynRankView( const typename traits::execution_space::scratch_memory_space& arg_space, const size_t arg_N0 = KOKKOS_INVALID_INDEX, const size_t arg_N1 = KOKKOS_INVALID_INDEX, @@ -1264,21 +997,38 @@ class DynRankView : public ViewTraits { const size_t arg_N6 = KOKKOS_INVALID_INDEX, const size_t arg_N7 = KOKKOS_INVALID_INDEX) - : DynRankView( - Kokkos::Impl::ViewCtorProp( - reinterpret_cast( - arg_space.get_shmem(map_type::memory_span( - Impl::DynRankDimTraits:: - createLayout(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, - arg_N6, arg_N7)))))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) {} + : DynRankView(arg_space, typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, + arg_N5, arg_N6, arg_N7)) {} + + KOKKOS_FUNCTION constexpr auto layout() const { + switch (rank()) { + case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); + case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); + case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); + case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); + case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); + case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); + case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); + case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); + default: + KOKKOS_IF_ON_HOST( + Kokkos::abort( + std::string( + "Calling DynRankView::layout on DRV of unexpected rank " + + std::to_string(rank())) + .c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "Calling DynRankView::layout on DRV of unexpected rank");) + } + // control flow should never reach here + return view_type::layout(); + } }; template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank( - const DynRankView& DRV) { +KOKKOS_FUNCTION constexpr unsigned rank(const DynRankView& DRV) { return DRV.rank(); } // needed for transition to common constexpr method in view and dynrankview // to return rank @@ -1293,181 +1043,46 @@ struct DynRankSubviewTag {}; } // namespace Impl -namespace Impl { - -template -class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value)), - Kokkos::Impl::DynRankSubviewTag>, - SrcTraits, Args...> { - private: - enum { - RZ = false, - R0 = bool(is_integral_extent<0, Args...>::value), - R1 = bool(is_integral_extent<1, Args...>::value), - R2 = bool(is_integral_extent<2, Args...>::value), - R3 = bool(is_integral_extent<3, Args...>::value), - R4 = bool(is_integral_extent<4, Args...>::value), - R5 = bool(is_integral_extent<5, Args...>::value), - R6 = bool(is_integral_extent<6, Args...>::value) - }; - - enum { - rank = unsigned(R0) + unsigned(R1) + unsigned(R2) + unsigned(R3) + - unsigned(R4) + unsigned(R5) + unsigned(R6) - }; - - using array_layout = Kokkos::LayoutStride; - - using value_type = typename SrcTraits::value_type; - - using data_type = value_type*******; - - public: - using traits_type = Kokkos::ViewTraits; - - using type = - Kokkos::View; - - template - struct apply { - static_assert(Kokkos::is_memory_traits::value); - - using traits_type = - Kokkos::ViewTraits; - - using type = Kokkos::View; - }; - - using dimension = typename SrcTraits::dimension; - - template - struct ExtentGenerator { - KOKKOS_INLINE_FUNCTION - static SubviewExtents<7, rank> generator( - const dimension& dim, Arg0 arg0 = Arg0(), Arg1 arg1 = Arg1(), - Arg2 arg2 = Arg2(), Arg3 arg3 = Arg3(), Arg4 arg4 = Arg4(), - Arg5 arg5 = Arg5(), Arg6 arg6 = Arg6()) { - return SubviewExtents<7, rank>(dim, arg0, arg1, arg2, arg3, arg4, arg5, - arg6); - } - }; - - using ret_type = Kokkos::DynRankView; - - template - KOKKOS_INLINE_FUNCTION static ret_type subview( - const unsigned src_rank, Kokkos::DynRankView const& src, - Args... args) { - using DstType = ViewMapping; - - using DstDimType = std::conditional_t< - (rank == 0), ViewDimension<>, - std::conditional_t< - (rank == 1), ViewDimension<0>, - std::conditional_t< - (rank == 2), ViewDimension<0, 0>, - std::conditional_t< - (rank == 3), ViewDimension<0, 0, 0>, - std::conditional_t< - (rank == 4), ViewDimension<0, 0, 0, 0>, - std::conditional_t< - (rank == 5), ViewDimension<0, 0, 0, 0, 0>, - std::conditional_t< - (rank == 6), ViewDimension<0, 0, 0, 0, 0, 0>, - ViewDimension<0, 0, 0, 0, 0, 0, 0>>>>>>>>; - - using dst_offset_type = ViewOffset; - using dst_handle_type = typename DstType::handle_type; - - ret_type dst; - - const SubviewExtents<7, rank> extents = ExtentGenerator::generator( - src.m_map.m_impl_offset.m_dim, args...); - - dst_offset_type tempdst(src.m_map.m_impl_offset, extents); - - dst.m_track = src.m_track; - - dst.m_map.m_impl_offset.m_dim.N0 = tempdst.m_dim.N0; - dst.m_map.m_impl_offset.m_dim.N1 = tempdst.m_dim.N1; - dst.m_map.m_impl_offset.m_dim.N2 = tempdst.m_dim.N2; - dst.m_map.m_impl_offset.m_dim.N3 = tempdst.m_dim.N3; - dst.m_map.m_impl_offset.m_dim.N4 = tempdst.m_dim.N4; - dst.m_map.m_impl_offset.m_dim.N5 = tempdst.m_dim.N5; - dst.m_map.m_impl_offset.m_dim.N6 = tempdst.m_dim.N6; - - dst.m_map.m_impl_offset.m_stride.S0 = tempdst.m_stride.S0; - dst.m_map.m_impl_offset.m_stride.S1 = tempdst.m_stride.S1; - dst.m_map.m_impl_offset.m_stride.S2 = tempdst.m_stride.S2; - dst.m_map.m_impl_offset.m_stride.S3 = tempdst.m_stride.S3; - dst.m_map.m_impl_offset.m_stride.S4 = tempdst.m_stride.S4; - dst.m_map.m_impl_offset.m_stride.S5 = tempdst.m_stride.S5; - dst.m_map.m_impl_offset.m_stride.S6 = tempdst.m_stride.S6; - - dst.m_map.m_impl_handle = - dst_handle_type(src.m_map.m_impl_handle + - src.m_map.m_impl_offset( - extents.domain_offset(0), extents.domain_offset(1), - extents.domain_offset(2), extents.domain_offset(3), - extents.domain_offset(4), extents.domain_offset(5), - extents.domain_offset(6))); - - dst.m_rank = - (src_rank > 0 ? unsigned(R0) : 0) + (src_rank > 1 ? unsigned(R1) : 0) + - (src_rank > 2 ? unsigned(R2) : 0) + (src_rank > 3 ? unsigned(R3) : 0) + - (src_rank > 4 ? unsigned(R4) : 0) + (src_rank > 5 ? unsigned(R5) : 0) + - (src_rank > 6 ? unsigned(R6) : 0); - - return dst; - } -}; - -} // namespace Impl - template using Subdynrankview = typename Kokkos::Impl::ViewMapping::ret_type; -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subdynrankview(const Kokkos::DynRankView& src, Args... args) { - if (src.rank() > sizeof...(Args)) // allow sizeof...(Args) >= src.rank(), - // ignore the remaining args - { - Kokkos::abort( - "subdynrankview: num of args must be >= rank of the source " - "DynRankView"); - } +template +KOKKOS_INLINE_FUNCTION auto subdynrankview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + auto sub = subview(drv.DownCast(), arg0, arg1, arg2, arg3, arg4, arg5, arg6); + using sub_t = decltype(sub); + size_t new_rank = (drv.rank() > 0 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 1 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 2 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 3 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 4 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 5 && !std::is_integral_v ? 1 : 0) + + (drv.rank() > 6 && !std::is_integral_v ? 1 : 0); - using metafcn = - Kokkos::Impl::ViewMapping, Args...>; - - return metafcn::subview(src.rank(), src, args...); + using return_type = + DynRankView; + return static_cast( + DynRankView( + sub, new_rank)); } - -// Wrapper to allow subview function name -template -KOKKOS_INLINE_FUNCTION Subdynrankview, Args...> -subview(const Kokkos::DynRankView& src, Args... args) { - return subdynrankview(src, args...); +template +KOKKOS_INLINE_FUNCTION auto subview( + const DynRankView& drv, SubArg0 arg0 = SubArg0{}, + SubArg1 arg1 = SubArg1{}, SubArg2 arg2 = SubArg2{}, + SubArg3 arg3 = SubArg3{}, SubArg4 arg4 = SubArg4{}, + SubArg5 arg5 = SubArg5{}, SubArg6 arg6 = SubArg6{}) { + return subdynrankview(drv, arg0, arg1, arg2, arg3, arg4, arg5, arg6); } } // namespace Kokkos @@ -1482,12 +1097,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const DynRankView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && lhs.rank() == rhs.rank() && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && lhs.extent(2) == rhs.extent(2) && @@ -1638,11 +1253,11 @@ namespace Impl { underlying memory, to facilitate implementation of deep_copy() and other routines that are defined on View */ template -KOKKOS_FUNCTION auto as_view_of_rank_n( +KOKKOS_FUNCTION View::type, Args...> +as_view_of_rank_n( DynRankView v, - typename std::enable_if::specialize, void>::value>::type* = - nullptr) { + std::enable_if_t< + std::is_same_v::specialize, void>>*) { if (v.rank() != N) { KOKKOS_IF_ON_HOST( const std::string message = @@ -1653,7 +1268,7 @@ KOKKOS_FUNCTION auto as_view_of_rank_n( Kokkos::abort("Converting DynRankView to a View of mis-matched rank!");) } - auto layout = v.impl_map().layout(); + auto layout = v.DownCast().layout(); if constexpr (std::is_same_v || std::is_same_v || @@ -1691,43 +1306,16 @@ void apply_to_view_of_static_rank(Function&& f, DynRankView a) { } // namespace Impl -template -KOKKOS_INLINE_FUNCTION constexpr auto DynRankView::layout() const -> - typename traits::array_layout { - switch (rank()) { - case 0: return Impl::as_view_of_rank_n<0>(*this).layout(); - case 1: return Impl::as_view_of_rank_n<1>(*this).layout(); - case 2: return Impl::as_view_of_rank_n<2>(*this).layout(); - case 3: return Impl::as_view_of_rank_n<3>(*this).layout(); - case 4: return Impl::as_view_of_rank_n<4>(*this).layout(); - case 5: return Impl::as_view_of_rank_n<5>(*this).layout(); - case 6: return Impl::as_view_of_rank_n<6>(*this).layout(); - case 7: return Impl::as_view_of_rank_n<7>(*this).layout(); - default: - KOKKOS_IF_ON_HOST( - Kokkos::abort( - std::string( - "Calling DynRankView::layout on DRV of unexpected rank " + - std::to_string(rank())) - .c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "Calling DynRankView::layout on DRV of unexpected rank");) - } - // control flow should never reach here - return m_map.layout(); -} - /** \brief Deep copy a value from Host memory into a view. */ template inline void deep_copy( const ExecSpace& e, const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); Impl::apply_to_view_of_static_rank( @@ -1738,8 +1326,8 @@ template inline void deep_copy( const DynRankView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Impl::apply_to_view_of_static_rank([=](auto view) { deep_copy(view, value); }, dst); } @@ -1750,8 +1338,8 @@ inline void deep_copy( const ExecSpace& e, typename ViewTraits::non_const_value_type& dst, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(e, dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1759,8 +1347,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const DynRankView& src, - std::enable_if_t::specialize, - void>::value>* = 0) { + std::enable_if_t::specialize, + void>>* = 0) { deep_copy(dst, Impl::as_view_of_rank_n<0>(src)); } @@ -1773,15 +1361,13 @@ inline void deep_copy( template inline void deep_copy( const ExecSpace& exec_space, const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1826,15 +1412,13 @@ inline void deep_copy( template inline void deep_copy( const DstType& dst, const SrcType& src, - std::enable_if_t< - (std::is_void::value && - std::is_void::value && - (Kokkos::is_dyn_rank_view::value || - Kokkos::is_dyn_rank_view::value))>* = nullptr) { - static_assert( - std::is_same::value, - "deep_copy requires non-const destination type"); + std::enable_if_t<(std::is_void_v && + std::is_void_v && + (Kokkos::is_dyn_rank_view::value || + Kokkos::is_dyn_rank_view::value))>* = nullptr) { + static_assert(std::is_same_v, + "deep_copy requires non-const destination type"); switch (rank(dst)) { case 0: @@ -1894,7 +1478,7 @@ struct MirrorDRViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1909,26 +1493,6 @@ struct MirrorDRViewType { std::conditional_t; }; -template -struct MirrorDRVType { - // The incoming view_type - using src_view_type = typename Kokkos::DynRankView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::DynRankView; -}; - } // namespace Impl namespace Impl { @@ -1945,10 +1509,9 @@ inline auto create_mirror(const DynRankView& src, arg_prop, std::string(src.label()).append("_mirror")); if constexpr (Impl::ViewCtorProp::has_memory_space) { - using dst_type = typename Impl::MirrorDRVType< + using dst_type = typename Impl::MirrorDRViewType< typename Impl::ViewCtorProp::memory_space, T, - P...>::view_type; - + P...>::dest_view_type; return dst_type(prop_copy, Impl::reconstructLayout(src.layout(), src.rank())); } else { @@ -1989,7 +1552,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(const Space&, const Kokkos::DynRankView& src) { +inline auto create_mirror(const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{})); } @@ -1999,8 +1563,8 @@ template ::value && std::is_void_v::specialize>>> -auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::DynRankView& src) { +inline auto create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::DynRankView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -2026,12 +1590,12 @@ inline auto create_mirror_view( [[maybe_unused]] const typename Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename DynRankView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename DynRankView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename DynRankView< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename DynRankView::data_type, + typename DynRankView::HostMirror::data_type>) { return typename DynRankView::HostMirror(src); } else { return Kokkos::Impl::choose_create_mirror(src, arg_prop); @@ -2102,7 +1666,7 @@ inline auto create_mirror_view( // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::DynRankView& src) { diff --git a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp index a4b74e246e..caae3f791f 100644 --- a/lib/kokkos/containers/src/Kokkos_DynamicView.hpp +++ b/lib/kokkos/containers/src/Kokkos_DynamicView.hpp @@ -40,10 +40,10 @@ struct ChunkedArrayManager { using pointer_type = ValueType*; using track_type = Kokkos::Impl::SharedAllocationTracker; - ChunkedArrayManager() = default; - ChunkedArrayManager(ChunkedArrayManager const&) = default; - ChunkedArrayManager(ChunkedArrayManager&&) = default; - ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; + ChunkedArrayManager() = default; + ChunkedArrayManager(ChunkedArrayManager const&) = default; + ChunkedArrayManager(ChunkedArrayManager&&) = default; + ChunkedArrayManager& operator=(ChunkedArrayManager&&) = default; ChunkedArrayManager& operator=(const ChunkedArrayManager&) = default; template @@ -129,10 +129,10 @@ struct ChunkedArrayManager { /// allocation template struct Destroy { - Destroy() = default; - Destroy(Destroy&&) = default; - Destroy(const Destroy&) = default; - Destroy& operator=(Destroy&&) = default; + Destroy() = default; + Destroy(Destroy&&) = default; + Destroy(const Destroy&) = default; + Destroy& operator=(Destroy&&) = default; Destroy& operator=(const Destroy&) = default; Destroy(std::string label, value_type** arg_chunk, @@ -250,7 +250,7 @@ class DynamicView : public Kokkos::ViewTraits { // It is assumed that the value_type is trivially copyable; // when this is not the case, potential problems can occur. - static_assert(std::is_void::value, + static_assert(std::is_void_v, "DynamicView only implemented for non-specialized View type"); private: @@ -363,7 +363,7 @@ class DynamicView : public Kokkos::ViewTraits { enum { reference_type_is_lvalue_reference = - std::is_lvalue_reference::value + std::is_lvalue_reference_v }; KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { @@ -463,11 +463,11 @@ class DynamicView : public Kokkos::ViewTraits { //---------------------------------------------------------------------- - ~DynamicView() = default; - DynamicView() = default; - DynamicView(DynamicView&&) = default; - DynamicView(const DynamicView&) = default; - DynamicView& operator=(DynamicView&&) = default; + ~DynamicView() = default; + DynamicView() = default; + DynamicView(DynamicView&&) = default; + DynamicView(const DynamicView&) = default; + DynamicView& operator=(DynamicView&&) = default; DynamicView& operator=(const DynamicView&) = default; template @@ -572,7 +572,7 @@ struct MirrorDynamicViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -665,9 +665,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorDynamicViewType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::DynamicView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::DynamicView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(wi, typename Space::memory_space{})); } @@ -693,14 +693,14 @@ inline auto create_mirror_view( const Kokkos::Experimental::DynamicView& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::DynamicView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::DynamicView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::DynamicView::HostMirror(src); } else { @@ -835,21 +835,17 @@ inline void deep_copy(const View& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } template @@ -861,21 +857,17 @@ inline void deep_copy(const Kokkos::Experimental::DynamicView& dst, using dst_execution_space = typename ViewTraits::execution_space; using src_memory_space = typename ViewTraits::memory_space; - enum { - DstExecCanAccessSrc = - Kokkos::SpaceAccessibility::accessible - }; + constexpr bool DstExecCanAccessSrc = + Kokkos::SpaceAccessibility::accessible; + static_assert( + DstExecCanAccessSrc, + "deep_copy given views that would require a temporary allocation"); - if (DstExecCanAccessSrc) { - // Copying data between views in accessible memory spaces and either - // non-contiguous or incompatible shape. - Kokkos::Impl::ViewRemap(dst, src); - Kokkos::fence("Kokkos::deep_copy(DynamicView)"); - } else { - Kokkos::Impl::throw_runtime_exception( - "deep_copy given views that would require a temporary allocation"); - } + // Copying data between views in accessible memory spaces and either + // non-contiguous or incompatible shape. + Kokkos::Impl::ViewRemap(dst, src); + Kokkos::fence("Kokkos::deep_copy(DynamicView)"); } namespace Impl { @@ -964,7 +956,7 @@ struct ViewCopy, // view_alloc template ::specialize>::value>> + std::is_void_v::specialize>>> auto create_mirror_view_and_copy( [[maybe_unused]] const Impl::ViewCtorProp& arg_prop, const Kokkos::Experimental::DynamicView& src) { diff --git a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp index 3adc70b190..cf23c25b86 100644 --- a/lib/kokkos/containers/src/Kokkos_OffsetView.hpp +++ b/lib/kokkos/containers/src/Kokkos_OffsetView.hpp @@ -50,9 +50,9 @@ inline constexpr bool is_offset_view_v = is_offset_view::value; #define KOKKOS_INVALID_INDEX_RANGE \ { KOKKOS_INVALID_OFFSET, KOKKOS_INVALID_OFFSET } -template ::value && - std::is_signed::value, - iType> = 0> +template && std::is_signed_v, + iType> = 0> using IndexRange = Kokkos::Array; using index_list_type = std::initializer_list; @@ -118,11 +118,11 @@ KOKKOS_INLINE_FUNCTION void offsetview_verify_operator_bounds( (enum {LEN = 1024}; char buffer[LEN]; const std::string label = tracker.template get_label(); int n = snprintf(buffer, LEN, - "OffsetView bounds error of view labeled %s (", - label.c_str()); + "OffsetView bounds error of view labeled %s (", + label.c_str()); offsetview_error_operator_bounds<0>(buffer + n, LEN - n, map, begins, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE( (Kokkos::abort("OffsetView bounds error"); (void)tracker;)) @@ -180,44 +180,40 @@ void runtime_check_rank_device(const size_t rank_dynamic, const size_t rank, } // namespace Impl template -class OffsetView : public ViewTraits { - public: - using traits = ViewTraits; - +class OffsetView : public View { private: template friend class OffsetView; - template - friend class View; // FIXME delete this line - template - friend class Kokkos::Impl::ViewMapping; - using map_type = Kokkos::Impl::ViewMapping; - using track_type = Kokkos::Impl::SharedAllocationTracker; + using base_t = View; public: - enum { Rank = map_type::Rank }; - using begins_type = Kokkos::Array; + // typedefs to reduce typing base_t:: further down + using traits = typename base_t::traits; + // FIXME: should be base_t::index_type after refactor + using index_type = typename base_t::memory_space::size_type; + using pointer_type = typename base_t::pointer_type; + + using begins_type = Kokkos::Array; template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t begin(const iType local_dimension) const { - return local_dimension < Rank ? m_begins[local_dimension] - : KOKKOS_INVALID_OFFSET; + return static_cast(local_dimension) < base_t::rank() + ? m_begins[local_dimension] + : KOKKOS_INVALID_OFFSET; } KOKKOS_FUNCTION begins_type begins() const { return m_begins; } template ::value, iType> = 0> + std::enable_if_t, iType> = 0> KOKKOS_FUNCTION int64_t end(const iType local_dimension) const { - return begin(local_dimension) + m_map.extent(local_dimension); + return begin(local_dimension) + base_t::extent(local_dimension); } private: - track_type m_track; - map_type m_map; begins_type m_begins; public: @@ -245,529 +241,60 @@ class OffsetView : public ViewTraits { typename traits::array_layout, typename traits::host_mirror_space>; - //---------------------------------------- - // Domain rank and extents - - /** \brief rank() to be implemented - */ - // KOKKOS_FUNCTION - // static - // constexpr unsigned rank() { return map_type::Rank; } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - extent(const iType& r) const { - return m_map.extent(r); + template + KOKKOS_FUNCTION typename base_t::reference_type offset_operator( + std::integer_sequence, OtherIndexTypes... indices) const { + return base_t::operator()((indices - m_begins[I])...); } - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - int> - extent_int(const iType& r) const { - return static_cast(m_map.extent(r)); - } - - KOKKOS_FUNCTION constexpr typename traits::array_layout layout() const { - return m_map.layout(); - } - - KOKKOS_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_FUNCTION constexpr size_t stride_0() const { return m_map.stride_0(); } - KOKKOS_FUNCTION constexpr size_t stride_1() const { return m_map.stride_1(); } - KOKKOS_FUNCTION constexpr size_t stride_2() const { return m_map.stride_2(); } - KOKKOS_FUNCTION constexpr size_t stride_3() const { return m_map.stride_3(); } - KOKKOS_FUNCTION constexpr size_t stride_4() const { return m_map.stride_4(); } - KOKKOS_FUNCTION constexpr size_t stride_5() const { return m_map.stride_5(); } - KOKKOS_FUNCTION constexpr size_t stride_6() const { return m_map.stride_6(); } - KOKKOS_FUNCTION constexpr size_t stride_7() const { return m_map.stride_7(); } - - template - KOKKOS_FUNCTION constexpr std::enable_if_t::value, - size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template - KOKKOS_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. - - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_FUNCTION constexpr pointer_type data() const { return m_map.data(); } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_FUNCTION - const Kokkos::Impl::ViewMapping& implementation_map() const { - return m_map; - } - - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); \ - Kokkos::Experimental::Impl::offsetview_verify_operator_bounds< \ - typename traits::memory_space> \ - ARG; - -#else - -#define KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY(ARG) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::OffsetView ERROR: attempt to access inaccessible memory " \ - "space"); - + template +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)) #endif - public: - //------------------------------ - // Rank 0 operator() - - KOKKOS_FORCEINLINE_FUNCTION - reference_type operator()() const { return m_map.reference(); } - //------------------------------ - // Rank 1 operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator()(const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (1 == Rank) && !is_default_map), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.reference(j0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && !is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[j0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && (1 == Rank) && - is_default_map && is_layout_stride), - reference_type> - operator[](const I0& i0) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0)) - const size_t j0 = i0 - m_begins[0]; - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * j0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (2 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - return m_map.reference(j0, j1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && (2 == Rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(const I0& i0, const I1& i1) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY((m_track, m_map, m_begins, i0, i1)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - if constexpr (is_layout_left) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_dim.N0 * j1]; - else - return m_map.m_impl_handle[j0 + m_map.m_impl_offset.m_stride * j1]; - } else if constexpr (is_layout_right) { - if constexpr (traits::rank_dynamic == 0) - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_dim.N1 * j0]; - else - return m_map.m_impl_handle[j1 + m_map.m_impl_offset.m_stride * j0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[j0 * m_map.m_impl_offset.m_stride.S0 + - j1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined(KOKKOS_COMPILER_INTEL) - __builtin_unreachable(); + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator[]( + const OtherIndexType& idx) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert(std::is_convertible_v && + std::is_nothrow_constructible_v && + (base_t::rank() == 1)); #endif + return base_t::operator[](idx - m_begins[0]); } - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2)]; + template +#ifndef KOKKOS_ENABLE_CXX17 + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())) +#endif + KOKKOS_FUNCTION constexpr typename base_t::reference_type operator()( + OtherIndexTypes... indices) const { +#ifdef KOKKOS_ENABLE_CXX17 + static_assert( + (std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && ...) && + (sizeof...(OtherIndexTypes) == base_t::rank())); +#endif + return offset_operator(std::make_index_sequence(), + indices...); } - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (3 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - return m_map.reference(j0, j1, j2); - } + template + KOKKOS_FUNCTION constexpr typename base_t::reference_type access( + OtherIndexTypes... args) const = delete; - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (4 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - return m_map.reference(j0, j1, j2, j3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::are_integral::value && - (5 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - return m_map.reference(j0, j1, j2, j3, j4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (6 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - return m_map.reference(j0, j1, j2, j3, j4, j5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (7 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map - .m_impl_handle[m_map.m_impl_offset(j0, j1, j2, j3, j4, j5, j6, j7)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::are_integral::value && - (8 == Rank) && !is_default_map), - reference_type> - operator()(const I0& i0, const I1& i1, const I2& i2, const I3& i3, - const I4& i4, const I5& i5, const I6& i6, const I7& i7) const { - KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY( - (m_track, m_map, m_begins, i0, i1, i2, i3, i4, i5, i6, i7)) - const size_t j0 = i0 - m_begins[0]; - const size_t j1 = i1 - m_begins[1]; - const size_t j2 = i2 - m_begins[2]; - const size_t j3 = i3 - m_begins[3]; - const size_t j4 = i4 - m_begins[4]; - const size_t j5 = i5 - m_begins[5]; - const size_t j6 = i6 - m_begins[6]; - const size_t j7 = i7 - m_begins[7]; - return m_map.reference(j0, j1, j2, j3, j4, j5, j6, j7); - } - -#undef KOKKOS_IMPL_OFFSETVIEW_OPERATOR_VERIFY + //---------------------------------------- //---------------------------------------- // Standard destructor, constructors, and assignment operators - KOKKOS_DEFAULTED_FUNCTION - ~OffsetView() = default; - KOKKOS_FUNCTION - OffsetView() : m_track(), m_map() { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = KOKKOS_INVALID_OFFSET; - } - - KOKKOS_FUNCTION - OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(rhs.m_map), - m_begins(rhs.m_begins) {} - - KOKKOS_FUNCTION - OffsetView(OffsetView&& rhs) - : m_track(std::move(rhs.m_track)), - m_map(std::move(rhs.m_map)), - m_begins(std::move(rhs.m_begins)) {} - - KOKKOS_FUNCTION - OffsetView& operator=(const OffsetView& rhs) { - m_track = rhs.m_track; - m_map = rhs.m_map; - m_begins = rhs.m_begins; - return *this; - } - - KOKKOS_FUNCTION - OffsetView& operator=(OffsetView&& rhs) { - m_track = std::move(rhs.m_track); - m_map = std::move(rhs.m_map); - m_begins = std::move(rhs.m_begins); - return *this; + OffsetView() : base_t() { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = KOKKOS_INVALID_OFFSET; } // interoperability with View @@ -778,20 +305,10 @@ class OffsetView : public ViewTraits { public: KOKKOS_FUNCTION - view_type view() const { - view_type v(m_track, m_map); - return v; - } + view_type view() const { return *this; } template - KOKKOS_FUNCTION OffsetView(const View& aview) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - + KOKKOS_FUNCTION OffsetView(const View& aview) : base_t(aview) { for (size_t i = 0; i < View::rank(); ++i) { m_begins[i] = 0; } @@ -800,19 +317,14 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const index_list_type& minIndices) - : m_track(aview.impl_track()), m_map() { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(aview) { + KOKKOS_IF_ON_HOST( + (Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, aview.label());)) + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -820,27 +332,13 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION OffsetView(const View& aview, const begins_type& beg) - : m_track(aview.impl_track()), m_map(), m_begins(beg) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, aview.impl_map(), m_track); - } + : base_t(aview), m_begins(beg) {} // may assign unmanaged from managed. template KOKKOS_FUNCTION OffsetView(const OffsetView& rhs) - : m_track(rhs.m_track, traits::is_managed), - m_map(), - m_begins(rhs.m_begins) { - using SrcTraits = typename OffsetView::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible OffsetView copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track); // swb what about assign? - } + : base_t(rhs.view()), m_begins(rhs.m_begins) {} private: enum class subtraction_failure { @@ -879,7 +377,7 @@ class OffsetView : public ViewTraits { static subtraction_failure runtime_check_begins_ends_host(const B& begins, const E& ends) { std::string message; - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) message += "begins.size() " "(" + @@ -887,19 +385,19 @@ class OffsetView : public ViewTraits { ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) message += "ends.size() " "(" + - std::to_string(begins.size()) + + std::to_string(ends.size()) + ")" " != Rank " "(" + - std::to_string(Rank) + + std::to_string(base_t::rank()) + ")" "\n"; @@ -941,7 +439,7 @@ class OffsetView : public ViewTraits { message = "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView\n" + message; - Kokkos::Impl::throw_runtime_exception(message); + Kokkos::abort(message.c_str()); } return subtraction_failure::none; @@ -951,11 +449,11 @@ class OffsetView : public ViewTraits { template KOKKOS_FUNCTION static subtraction_failure runtime_check_begins_ends_device( const B& begins, const E& ends) { - if (begins.size() != Rank) + if (begins.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: begins has bad Rank"); - if (ends.size() != Rank) + if (ends.size() != base_t::rank()) Kokkos::abort( "Kokkos::Experimental::OffsetView ERROR: for unmanaged " "OffsetView: ends has bad Rank"); @@ -993,20 +491,25 @@ class OffsetView : public ViewTraits { // Precondition: begins.size() == ends.size() == m_begins.size() == Rank template KOKKOS_FUNCTION OffsetView(const pointer_type& p, const B& begins_, - const E& ends_, - subtraction_failure) - : m_track() // no tracking - , - m_map(Kokkos::Impl::ViewCtorProp(p), - typename traits::array_layout( - Rank > 0 ? at(ends_, 0) - at(begins_, 0) : 0, - Rank > 1 ? at(ends_, 1) - at(begins_, 1) : 0, - Rank > 2 ? at(ends_, 2) - at(begins_, 2) : 0, - Rank > 3 ? at(ends_, 3) - at(begins_, 3) : 0, - Rank > 4 ? at(ends_, 4) - at(begins_, 4) : 0, - Rank > 5 ? at(ends_, 5) - at(begins_, 5) : 0, - Rank > 6 ? at(ends_, 6) - at(begins_, 6) : 0, - Rank > 7 ? at(ends_, 7) - at(begins_, 7) : 0)) { + const E& ends_, subtraction_failure) + : base_t(Kokkos::view_wrap(p), + typename traits::array_layout( + base_t::rank() > 0 ? at(ends_, 0) - at(begins_, 0) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 1 ? at(ends_, 1) - at(begins_, 1) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 2 ? at(ends_, 2) - at(begins_, 2) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 3 ? at(ends_, 3) - at(begins_, 3) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 4 ? at(ends_, 4) - at(begins_, 4) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 5 ? at(ends_, 5) - at(begins_, 5) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 6 ? at(ends_, 6) - at(begins_, 6) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + base_t::rank() > 7 ? at(ends_, 7) - at(begins_, 7) + : KOKKOS_IMPL_CTOR_DEFAULT_ARG)) { for (size_t i = 0; i != m_begins.size(); ++i) { m_begins[i] = at(begins_, i); }; @@ -1040,15 +543,6 @@ class OffsetView : public ViewTraits { : OffsetView(p, begins_, ends_, runtime_check_begins_ends(begins_, ends_)) {} - //---------------------------------------- - // Allocation tracking properties - KOKKOS_FUNCTION - int use_count() const { return m_track.use_count(); } - - const std::string label() const { - return m_track.template get_label(); - } - // Choosing std::pair as type for the arguments allows constructing an // OffsetView using list initialization syntax, e.g., // OffsetView dummy("dummy", {-1, 3}, {-2,2}); @@ -1070,18 +564,34 @@ class OffsetView : public ViewTraits { const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE ) - : OffsetView( - Kokkos::Impl::ViewCtorProp(arg_label), - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(Kokkos::Impl::ViewCtorProp(arg_label), + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG - 1 + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit OffsetView( @@ -1094,18 +604,34 @@ class OffsetView : public ViewTraits { const std::pair range5 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range6 = KOKKOS_INVALID_INDEX_RANGE, const std::pair range7 = KOKKOS_INVALID_INDEX_RANGE) - : OffsetView( - arg_prop, - typename traits::array_layout(range0.second - range0.first + 1, - range1.second - range1.first + 1, - range2.second - range2.first + 1, - range3.second - range3.first + 1, - range4.second - range4.first + 1, - range5.second - range5.first + 1, - range6.second - range6.first + 1, - range7.second - range7.first + 1), - {range0.first, range1.first, range2.first, range3.first, - range4.first, range5.first, range6.first, range7.first}) {} + : OffsetView(arg_prop, + typename traits::array_layout( + range0.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range0.second - range0.first + 1, + range1.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range1.second - range1.first + 1, + range2.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range2.second - range2.first + 1, + range3.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range3.second - range3.first + 1, + range4.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range4.second - range4.first + 1, + range5.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range5.second - range5.first + 1, + range6.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range6.second - range6.first + 1, + range7.first == KOKKOS_INVALID_OFFSET + ? KOKKOS_IMPL_CTOR_DEFAULT_ARG + : range7.second - range7.first + 1), + {range0.first, range1.first, range2.first, range3.first, + range4.first, range5.first, range6.first, range7.first}) {} template explicit KOKKOS_FUNCTION OffsetView( @@ -1113,9 +639,14 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { + : base_t(arg_prop, arg_layout) { + KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( + traits::rank_dynamic, base_t::rank(), minIndices, + base_t::label());)) + + KOKKOS_IF_ON_DEVICE( + (Kokkos::Experimental::Impl::runtime_check_rank_device( + traits::rank_dynamic, base_t::rank(), minIndices);)) for (size_t i = 0; i < minIndices.size(); ++i) { m_begins[i] = minIndices.begin()[i]; } @@ -1132,42 +663,9 @@ class OffsetView : public ViewTraits { std::enable_if_t::has_pointer, typename traits::array_layout> const& arg_layout, const index_list_type minIndices) - : m_track(), - m_map() - - { - for (size_t i = 0; i < Rank; ++i) m_begins[i] = minIndices.begin()[i]; - - // Copy the input allocation properties with possibly defaulted properties - auto prop_copy = Kokkos::Impl::with_properties_if_unset( - arg_prop, std::string{}, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "OffsetView allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception( - "Constructing OffsetView and initializing data with uninitialized " - "execution space"); - } - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, - Kokkos::Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.assign_allocated_record_to_uninitialized(record); - - KOKKOS_IF_ON_HOST((Kokkos::Experimental::Impl::runtime_check_rank_host( - traits::rank_dynamic, Rank, minIndices, label());)) - - KOKKOS_IF_ON_DEVICE((Kokkos::Experimental::Impl::runtime_check_rank_device( - traits::rank_dynamic, Rank, minIndices);)) + : base_t(arg_prop, arg_layout) { + for (size_t i = 0; i < base_t::rank(); ++i) + m_begins[i] = minIndices.begin()[i]; } }; @@ -1177,7 +675,7 @@ class OffsetView : public ViewTraits { */ template KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { - return V.Rank; + return V.rank(); } // Temporary until added to view //---------------------------------------------------------------------------- @@ -1185,8 +683,8 @@ KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const OffsetView& V) { namespace Impl { template -KOKKOS_INLINE_FUNCTION std::enable_if_t::value, T> -shift_input(const T arg, const int64_t offset) { +KOKKOS_INLINE_FUNCTION std::enable_if_t, T> shift_input( + const T arg, const int64_t offset) { return arg - offset; } @@ -1197,13 +695,13 @@ Kokkos::ALL_t shift_input(const Kokkos::ALL_t arg, const int64_t /*offset*/) { template KOKKOS_INLINE_FUNCTION - std::enable_if_t::value, Kokkos::pair> + std::enable_if_t, Kokkos::pair> shift_input(const Kokkos::pair arg, const int64_t offset) { return Kokkos::make_pair(arg.first - offset, arg.second - offset); } template -inline std::enable_if_t::value, std::pair> -shift_input(const std::pair arg, const int64_t offset) { +inline std::enable_if_t, std::pair> shift_input( + const std::pair arg, const int64_t offset) { return std::make_pair(arg.first - offset, arg.second - offset); } @@ -1212,7 +710,7 @@ KOKKOS_INLINE_FUNCTION void map_arg_to_new_begin( const size_t i, Kokkos::Array& subviewBegins, std::enable_if_t shiftedArg, const Arg arg, const A viewBegins, size_t& counter) { - if (!std::is_integral::value) { + if (!std::is_integral_v) { subviewBegins[counter] = shiftedArg == arg ? viewBegins[i] : 0; counter++; } @@ -1621,7 +1119,7 @@ KOKKOS_INLINE_FUNCTION ViewTraits, Args...>::type>::type subview(const OffsetView& src, Args... args) { static_assert( - OffsetView::Rank == sizeof...(Args), + OffsetView::rank() == sizeof...(Args), "subview requires one argument for each source OffsetView rank"); return Kokkos::Experimental::Impl::subview_offset(src, args...); @@ -1641,12 +1139,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const OffsetView& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1672,12 +1170,12 @@ KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, using lhs_traits = ViewTraits; using rhs_traits = ViewTraits; - return std::is_same::value && - std::is_same::value && - std::is_same::value && + return std::is_same_v && + std::is_same_v && + std::is_same_v && unsigned(lhs_traits::rank) == unsigned(rhs_traits::rank) && lhs.data() == rhs.data() && lhs.span() == rhs.span() && lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && @@ -1704,11 +1202,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::non_const_value_type, - typename ViewTraits::value_type>::value, + std::is_same_v::non_const_value_type, + typename ViewTraits::value_type>, "deep_copy requires non-const type"); auto dstView = dst.view(); @@ -1719,11 +1217,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1733,11 +1231,11 @@ template inline void deep_copy( const Experimental::OffsetView& dst, const View& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); auto dstView = dst.view(); @@ -1748,11 +1246,11 @@ template inline void deep_copy( const View& dst, const Experimental::OffsetView& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { static_assert( - std::is_same::value_type, - typename ViewTraits::non_const_value_type>::value, + std::is_same_v::value_type, + typename ViewTraits::non_const_value_type>, "deep_copy requires matching non-const destination type"); Kokkos::deep_copy(dst, value.view()); @@ -1770,7 +1268,7 @@ struct MirrorOffsetViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -1786,27 +1284,6 @@ struct MirrorOffsetViewType { std::conditional_t; }; -template -struct MirrorOffsetType { - // The incoming view_type - using src_view_type = typename Kokkos::Experimental::OffsetView; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it.) - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = - Kokkos::Experimental::OffsetView; -}; - } // namespace Impl namespace Impl { @@ -1825,10 +1302,12 @@ inline auto create_mirror(const Kokkos::Experimental::OffsetView& src, auto prop_copy = Impl::with_properties_if_unset( arg_prop, std::string(src.label()).append("_mirror")); - return typename Kokkos::Impl::MirrorOffsetType::view_type( - prop_copy, src.layout(), - {src.begin(0), src.begin(1), src.begin(2), src.begin(3), src.begin(4), - src.begin(5), src.begin(6), src.begin(7)}); + return typename Kokkos::Impl::MirrorOffsetViewType< + Space, T, P...>::dest_view_type(prop_copy, src.layout(), + {src.begin(0), src.begin(1), + src.begin(2), src.begin(3), + src.begin(4), src.begin(5), + src.begin(6), src.begin(7)}); } else { return typename Kokkos::Experimental::OffsetView::HostMirror( Kokkos::create_mirror(arg_prop, src.view()), src.begins()); @@ -1877,9 +1356,9 @@ template ::value && std::is_void_v::specialize>>> -typename Kokkos::Impl::MirrorOffsetType::view_type -create_mirror(Kokkos::Impl::WithoutInitializing_t wi, const Space&, - const Kokkos::Experimental::OffsetView& src) { +inline auto create_mirror( + Kokkos::Impl::WithoutInitializing_t wi, const Space&, + const Kokkos::Experimental::OffsetView& src) { return Impl::create_mirror( src, Kokkos::view_alloc(typename Space::memory_space{}, wi)); } @@ -1905,14 +1384,14 @@ inline auto create_mirror_view( const Kokkos::Experimental::OffsetView& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::Experimental::OffsetView< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::memory_space> && + std::is_same_v::data_type, + typename Kokkos::Experimental::OffsetView< + T, P...>::HostMirror::data_type>) { return typename Kokkos::Experimental::OffsetView::HostMirror(src); } else { diff --git a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp index 9d04cf6acd..52af567c61 100644 --- a/lib/kokkos/containers/src/Kokkos_ScatterView.hpp +++ b/lib/kokkos/containers/src/Kokkos_ScatterView.hpp @@ -184,16 +184,16 @@ struct DefaultContribution -struct DefaultDuplication { +struct DefaultDuplication { using type = Kokkos::Experimental::ScatterNonDuplicated; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; template <> -struct DefaultContribution { using type = Kokkos::Experimental::ScatterAtomic; }; @@ -532,32 +532,56 @@ void args_to_array(size_t* array, int pos, T dim0, Dims... dims) { subview where the index specified is the largest-stride one. */ template struct Slice { - using next = Slice; - using value_type = typename next::value_type; - - static value_type get(V const& src, const size_t i, Args... args) { + using next = Slice; + static auto get(V const& src, const size_t i, Args... args) { return next::get(src, i, Kokkos::ALL, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, i, args...); } }; template struct Slice { - using value_type = - typename Kokkos::Impl::ViewMapping::type; - static value_type get(V const& src, const size_t i, Args... args) { + static auto get(V const& src, const size_t i, Args... args) { return Kokkos::subview(src, args..., i); } }; +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; + +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, i, args...); + } +}; + +template +struct Slice, 1, V, Args...> { + static auto get(V const& src, const size_t i, Args... args) { + return Kokkos::subview(src, args..., i); + } +}; +#endif + template struct ReduceDuplicates; @@ -905,7 +929,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(arg); @@ -1028,10 +1052,7 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutRight, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1233,8 +1254,8 @@ class ScatterView::value_type - subview() const { + auto subview() const { return Kokkos::Impl::Experimental::Slice< Kokkos::LayoutLeft, internal_view_type::rank, internal_view_type>::get(internal_view, 0); @@ -1460,7 +1478,7 @@ class ScatterAccess KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - view_type::original_view_type::rank == 1 && std::is_integral::value, + std::is_integral_v && view_type::original_view_type::rank == 1, value_type> operator[](Arg arg) const { return view.at(thread_id, arg); @@ -1470,9 +1488,9 @@ class ScatterAccess::array_layout, typename ViewTraits::device_type, Op, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>, std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultContribution< typename ViewTraits::execution_space, typename std::conditional_t< - std::is_void::value, + std::is_void_v, typename Kokkos::Impl::Experimental::DefaultDuplication< typename ViewTraits::execution_space>::type, Duplication>>::type, diff --git a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp index 8ce868cac2..ec1b8905c7 100644 --- a/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp +++ b/lib/kokkos/containers/src/Kokkos_StaticCrsGraph.hpp @@ -190,7 +190,7 @@ struct GraphRowViewConst { const typename GraphType::entries_type& colidx_in, const ordinal_type& stride, const ordinal_type& count, const OffsetType& idx, - const std::enable_if_t::value, int>& = 0) + const std::enable_if_t, int>& = 0) : colidx_(&colidx_in(idx)), stride_(stride), length(count) {} /// \brief Number of entries in the row. diff --git a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp index c3a8b67df8..4f47051a5c 100644 --- a/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp +++ b/lib/kokkos/containers/src/Kokkos_UnorderedMap.hpp @@ -34,7 +34,7 @@ #include #include -#include +#include #include @@ -746,7 +746,7 @@ class UnorderedMap { /// 'const value_type' via Cuda texture fetch must return by value. template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_void::value, // !is_set + !std::is_void_v, // !is_set std::conditional_t> value_at(size_type i) const { KOKKOS_EXPECTS(i < capacity()); @@ -808,8 +808,8 @@ class UnorderedMap { // Re-allocate the views of the calling UnorderedMap according to src // capacity, and deep copy the src data. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> create_copy_view( UnorderedMap const &src) { if (m_hash_lists.data() != src.m_hash_lists.data()) { @@ -821,8 +821,8 @@ class UnorderedMap { // Allocate views of the calling UnorderedMap with the same capacity as the // src. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> allocate_view( UnorderedMap const &src) { insertable_map_type tmp; @@ -852,8 +852,8 @@ class UnorderedMap { // Deep copy view data from src. This requires that the src capacity is // identical to the capacity of the calling UnorderedMap. template - std::enable_if_t, key_type>::value && - std::is_same, value_type>::value> + std::enable_if_t, key_type> && + std::is_same_v, value_type>> deep_copy_view( UnorderedMap const &src) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 diff --git a/lib/kokkos/containers/src/Kokkos_Vector.hpp b/lib/kokkos/containers/src/Kokkos_Vector.hpp index 88109fb0ba..83ccfbf630 100644 --- a/lib/kokkos/containers/src/Kokkos_Vector.hpp +++ b/lib/kokkos/containers/src/Kokkos_Vector.hpp @@ -172,9 +172,8 @@ class KOKKOS_DEPRECATED vector private: template - struct impl_is_input_iterator - : /* TODO replace this */ std::bool_constant< - !std::is_convertible::value> {}; + struct impl_is_input_iterator : /* TODO replace this */ std::bool_constant< + !std::is_convertible_v> {}; public: // TODO: can use detection idiom to generate better error message here later diff --git a/lib/kokkos/containers/unit_tests/CMakeLists.txt b/lib/kokkos/containers/unit_tests/CMakeLists.txt index e69e46bb6a..6255a86c46 100644 --- a/lib/kokkos/containers/unit_tests/CMakeLists.txt +++ b/lib/kokkos/containers/unit_tests/CMakeLists.txt @@ -1,8 +1,7 @@ - -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_BINARY_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) -KOKKOS_INCLUDE_DIRECTORIES(${CMAKE_CURRENT_SOURCE_DIR}/../src ) -KOKKOS_INCLUDE_DIRECTORIES(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR}) +kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) +kokkos_include_directories(${CMAKE_CURRENT_SOURCE_DIR}/../src) +kokkos_include_directories(${KOKKOS_SOURCE_DIR}/core/unit_test/category_files) foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) string(TOUPPER ${Tag} DEVICE) @@ -12,57 +11,49 @@ foreach(Tag Threads;Serial;OpenMP;HPX;Cuda;HIP;SYCL) set(UnitTestSources UnitTestMain.cpp) set(dir ${CMAKE_CURRENT_BINARY_DIR}/${dir}) file(MAKE_DIRECTORY ${dir}) - foreach(Name - Bitset - DualView - DynamicView - DynViewAPI_generic - DynViewAPI_rank12345 - DynViewAPI_rank67 - ErrorReporter - OffsetView - ScatterView - StaticCrsGraph - WithoutInitializing - UnorderedMap - Vector - ViewCtorPropEmbeddedDim - ) + foreach( + Name + Bitset + DualView + DynamicView + DynViewAPI_generic + DynViewAPI_rank12345 + DynViewAPI_rank67 + DynRankView_TeamScratch + ErrorReporter + OffsetView + ScatterView + StaticCrsGraph + WithoutInitializing + UnorderedMap + Vector + ViewCtorPropEmbeddedDim + ) if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4 AND Name STREQUAL "Vector") continue() # skip Kokkos::vector test if deprecated code 4 is not enabled endif() # Write to a temporary intermediate file and call configure_file to avoid # updating timestamps triggering unnecessary rebuilds on subsequent cmake runs. set(file ${dir}/Test${Tag}_${Name}.cpp) - file(WRITE ${dir}/dummy.cpp - "#include \n" - "#include \n" - ) + file(WRITE ${dir}/dummy.cpp "#include \n" "#include \n") configure_file(${dir}/dummy.cpp ${file}) list(APPEND UnitTestSources ${file}) endforeach() #fatal error C1128: number of sections exceeded object file format limit: compile with /bigobj if(KOKKOS_ENABLE_CUDA AND WIN32) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_DynViewAPI_generic.cpp) endif() # FIXME_NVHPC: NVC++-S-0000-Internal compiler error. extractor: bad opc 0 if(KOKKOS_ENABLE_CUDA AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - LIST(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) + list(REMOVE_ITEM UnitTestSources ${dir}/TestCuda_WithoutInitializing.cpp) endif() - KOKKOS_ADD_EXECUTABLE_AND_TEST(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) + kokkos_add_executable_and_test(ContainersUnitTest_${Tag} SOURCES ${UnitTestSources}) endif() endforeach() -SET(COMPILE_ONLY_SOURCES - TestCreateMirror.cpp - TestDualViewParameterPack.cpp - TestIsViewTrait.cpp -) -KOKKOS_ADD_EXECUTABLE( - ContainersTestCompileOnly - SOURCES - TestCompileMain.cpp - ${COMPILE_ONLY_SOURCES} +set(COMPILE_ONLY_SOURCES TestCreateMirror.cpp TestDualViewParameterPack.cpp TestIsViewTrait.cpp + TestDynRankViewTypedefs.cpp ) +kokkos_add_executable(ContainersTestCompileOnly SOURCES TestCompileMain.cpp ${COMPILE_ONLY_SOURCES}) diff --git a/lib/kokkos/containers/unit_tests/TestBitset.hpp b/lib/kokkos/containers/unit_tests/TestBitset.hpp index 9923453f72..91dc1710e5 100644 --- a/lib/kokkos/containers/unit_tests/TestBitset.hpp +++ b/lib/kokkos/containers/unit_tests/TestBitset.hpp @@ -39,7 +39,7 @@ struct TestBitset { TestBitset(bitset_type const& bitset) : m_bitset(bitset) {} - unsigned testit(unsigned collisions) { + unsigned testit(unsigned long long collisions) { execution_space().fence(); unsigned count = 0; diff --git a/lib/kokkos/containers/unit_tests/TestDualView.hpp b/lib/kokkos/containers/unit_tests/TestDualView.hpp index 2512cb5c49..5d03e6202a 100644 --- a/lib/kokkos/containers/unit_tests/TestDualView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDualView.hpp @@ -71,7 +71,7 @@ struct test_dualview_copy_construction_and_assignment { using SrcViewType = Kokkos::DualView; using DstViewType = - Kokkos::DualView; + Kokkos::DualView; SrcViewType a("A", n, m); @@ -520,58 +520,26 @@ namespace { * that we keep the semantics of UVM DualViews intact. */ // modify if we have other UVM enabled backends -#if defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_SYCL) || \ - defined(KOKKOS_ENABLE_HIP) // OR other UVM builds -#define UVM_ENABLED_BUILD -#endif -#ifdef UVM_ENABLED_BUILD -template -struct UVMSpaceFor; -#endif - -#ifdef KOKKOS_ENABLE_CUDA // specific to CUDA -template <> -struct UVMSpaceFor { - using type = Kokkos::CudaUVMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_SYCL // specific to SYCL -template <> -struct UVMSpaceFor { - using type = Kokkos::Experimental::SYCLSharedUSMSpace; -}; -#endif - -#ifdef KOKKOS_ENABLE_HIP // specific to HIP -template <> -struct UVMSpaceFor { - using type = Kokkos::HIPManagedSpace; -}; -#endif - -#ifdef UVM_ENABLED_BUILD -template <> -struct UVMSpaceFor { - using type = typename UVMSpaceFor::type; -}; +#ifdef KOKKOS_HAS_SHARED_SPACE +template +using TestSharedSpace = Kokkos::SharedSpace; #else -template -struct UVMSpaceFor { - using type = typename ExecSpace::memory_space; -}; +template +using TestSharedSpace = typename ExecutionSpace::memory_space; #endif using ExecSpace = Kokkos::DefaultExecutionSpace; -using MemSpace = typename UVMSpaceFor::type; +using MemSpace = TestSharedSpace; using DeviceType = Kokkos::Device; using DualViewType = Kokkos::DualView; -using d_device = DeviceType; -using h_device = Kokkos::Device< - Kokkos::DefaultHostExecutionSpace, - typename UVMSpaceFor::type>; +using ConstDualViewType = + Kokkos::DualView; +using d_device = DeviceType; +using h_device = + Kokkos::Device>; TEST(TEST_CATEGORY, dualview_device_correct_kokkos_device) { DualViewType dv("myView", 100); @@ -635,14 +603,69 @@ TEST(TEST_CATEGORY, dualview_template_views_return_correct_executionspace_views) { DualViewType dv("myView", 100); dv.clear_sync_state(); - using hvt = decltype(dv.view()); - using dvt = decltype(dv.view()); + using hvt = decltype(dv.view()); + using dvt = decltype(dv.view()); ASSERT_STREQ(Kokkos::DefaultExecutionSpace::name(), dvt::device_type::execution_space::name()); ASSERT_STREQ(Kokkos::DefaultHostExecutionSpace::name(), hvt::device_type::execution_space::name()); } +TEST(TEST_CATEGORY, + dualview_template_views_return_correct_views_from_const_dual_view) { + DualViewType dv("myView", 100); + ConstDualViewType const_dv = dv; + dv.clear_sync_state(); + ASSERT_EQ(dv.view(), + const_dv.view()); + ASSERT_EQ(dv.view(), + const_dv.view()); +} + +// User-defined types with a View data member, only host-constructible +template +class S { + V v_; + + public: + template + S(std::string label, Extents... extents) : v_(std::move(label), extents...) {} + S() : v_("v", 10) {} +}; + +template +auto initialize_view_of_views() { + Kokkos::DualView dv_v( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 3u); + + V v("v", 2); + V w("w", 2); + dv_v.h_view(0) = v; + dv_v.h_view(1) = w; + + dv_v.modify_host(); + dv_v.sync_device(); + + return dv_v; +} + +TEST(TEST_CATEGORY, dualview_sequential_host_init) { + auto dv_v = initialize_view_of_views>(); + dv_v.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv_v.d_view.size(), 2u); + ASSERT_EQ(dv_v.h_view.size(), 2u); + + initialize_view_of_views>>(); + + Kokkos::DualView dv( + Kokkos::view_alloc("myView", Kokkos::SequentialHostInit), 1u); + dv.resize(Kokkos::view_alloc(Kokkos::SequentialHostInit), 2u); + ASSERT_EQ(dv.d_view.size(), 2u); + ASSERT_EQ(dv.h_view.size(), 2u); + dv.realloc(Kokkos::view_alloc(Kokkos::SequentialHostInit), 3u); + ASSERT_EQ(dv.d_view.size(), 3u); + ASSERT_EQ(dv.h_view.size(), 3u); +} } // anonymous namespace } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp new file mode 100644 index 0000000000..95117a22e6 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestDynRankViewTypedefs.cpp @@ -0,0 +1,260 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include +#include + +namespace { + +// clang-format off +template +struct data_analysis { + using data_type = DataType; + using const_data_type = const DataType; + using runtime_data_type = DataType; + using runtime_const_data_type = const DataType; + using non_const_data_type = std::remove_const_t; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type*; + using const_data_type = typename data_analysis::const_data_type*; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type*; +}; + +template +struct data_analysis { + using data_type = typename data_analysis::data_type[N]; + using const_data_type = typename data_analysis::const_data_type[N]; + using runtime_data_type = typename data_analysis::runtime_data_type*; + using runtime_const_data_type = typename data_analysis::runtime_const_data_type*; + using non_const_data_type = typename data_analysis::non_const_data_type[N]; +}; + +template +constexpr bool test_view_typedefs_impl() { + // ======================== + // inherited from ViewTraits + // ======================== + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + + // FIXME: these should be deprecated and for proper testing (I.e. where this is different from data_type) + // we would need ensemble types which use the hidden View dimension facility of View (i.e. which make + // "specialize" not void) + static_assert(std::is_same_v); + static_assert(std::is_same_v::const_data_type>); + static_assert(std::is_same_v::non_const_data_type>); + static_assert(std::is_same_v); + + // FIXME: value_type definition conflicts with mdspan value_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + + // FIXME: should maybe be deprecated + static_assert(std::is_same_v); + + // FIXME: should be deprecated and is some complicated impl type + // static_assert(!std::is_void_v); + + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should be deprecated in favor of reference + static_assert(std::is_same_v); + // FIXME: should be deprecated in favor of data_handle_type + static_assert(std::is_same_v); + + // ========================================= + // in Legacy View: some helper View variants + // ========================================= + + // FIXME: in contrast to View, hooks_policy is not propagated + static_assert(std::is_same_v); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + static_assert(std::is_same_v>); + +/* FIXME: these don't exist in DynRankView, should they? + using uniform_layout_type = std::conditional_t), + Kokkos::LayoutLeft, Layout>; + + // Uhm uniformtype removes all memorytraits? + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + typename ViewType::device_type, Kokkos::MemoryTraits<0>>>); + + using anonymous_device_type = Kokkos::Device; + static_assert(std::is_same_v>>); + static_assert(std::is_same_v>>); + static_assert(std::is_same_v::runtime_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); + static_assert(std::is_same_v::runtime_const_data_type, uniform_layout_type, + anonymous_device_type, Kokkos::MemoryTraits<0>>>); +*/ + + // ================================== + // mdspan compatibility + // ================================== + + // FIXME: This typedef caused some weird issue with MSVC+NVCC + // static_assert(std::is_same_v); + // FIXME: Not supported yet + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + // static_assert(std::is_same_v); + + static_assert(std::is_same_v); + // FIXME: should be remove_const_t + static_assert(std::is_same_v); + // FIXME: should be extents_type::index_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + + // FIXME: should come from accessor_type + static_assert(std::is_same_v); + static_assert(std::is_same_v); + return true; +} + +// Helper function to unpack data type and other args from the View, and pass them on +template +struct ViewParams {}; + +template +constexpr bool test_view_typedefs(ViewParams) { + return test_view_typedefs_impl, Kokkos::ViewTraits, + T, L, S, M, HostMirrorSpace, ValueType, ReferenceType>(); +} + + +constexpr bool is_host_exec = std::is_same_v; + +#if defined(KOKKOS_ENABLE_CUDA_UVM) || defined(KOKKOS_ENABLE_IMPL_CUDA_UNIFIED_MEMORY) || defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +constexpr bool has_unified_mem_space = true; +#else +constexpr bool has_unified_mem_space = false; +#endif + +// The test take explicit template arguments for: LayoutType, Space, MemoryTraits, HostMirrorSpace, ValueType, ReferenceType +// The ViewParams is just a type pack for the View template arguments + +// Kokkos::View +namespace TestInt { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestIntDefaultExecutionSpace { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + // HostMirrorSpace is a mess so: if the default exec is a host exec, it is HostSpace (note difference from View ...) + using host_mirror_space = std::conditional_t>; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View +namespace TestFloatPPHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::HostSpace; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams{})); +} + +// Kokkos::View> +namespace TestFloatPPDeviceDefaultHostExecHostSpace { + using layout_type = Kokkos::LayoutRight; + using space = Kokkos::Device; + using memory_traits = Kokkos::MemoryTraits<0>; + using host_mirror_space = Kokkos::HostSpace; + static_assert(test_view_typedefs( + ViewParams>{})); +} + +// Kokkos::View> +namespace TestIntAtomic { + using layout_type = Kokkos::DefaultExecutionSpace::array_layout; + using space = Kokkos::DefaultExecutionSpace; + using memory_traits = Kokkos::MemoryTraits; + // HostMirrorSpace is a mess so: if the default exec is a host exec, that is it + using host_mirror_space = std::conditional_t>>; + static_assert(test_view_typedefs>>>( + ViewParams>{})); +} +// clang-format on +} // namespace diff --git a/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp b/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp new file mode 100644 index 0000000000..e5f8860de7 --- /dev/null +++ b/lib/kokkos/containers/unit_tests/TestDynRankView_TeamScratch.hpp @@ -0,0 +1,72 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#include + +#include + +namespace { + +void test_dyn_rank_view_team_scratch() { + using execution_space = TEST_EXECSPACE; + using memory_space = execution_space::scratch_memory_space; + using drv_type = Kokkos::DynRankView; + using policy_type = Kokkos::TeamPolicy; + using team_type = policy_type::member_type; + + int N0 = 10, N1 = 4, N2 = 3; + size_t shmem_size = drv_type::shmem_size(N0, N1, N2); + ASSERT_GE(shmem_size, N0 * N1 * N2 * sizeof(int)); + + Kokkos::View> + errors("errors"); + auto policy = policy_type(1, Kokkos::AUTO) + .set_scratch_size(0, Kokkos::PerTeam(shmem_size)); + Kokkos::parallel_for( + policy, KOKKOS_LAMBDA(const team_type& team) { + drv_type scr(team.team_scratch(0), N0, N1, N2); + // Control that the code ran at all + if (scr.rank() != 3) errors() |= 1u; + if (scr.extent_int(0) != N0) errors() |= 2u; + if (scr.extent_int(1) != N1) errors() |= 4u; + if (scr.extent_int(2) != N2) errors() |= 8u; + Kokkos::parallel_for( + Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { scr(i, j, k) = i * 100 + j * 10 + k; }); + team.team_barrier(); + Kokkos::parallel_for(Kokkos::TeamThreadMDRange(team, N0, N1, N2), + [=](int i, int j, int k) { + if (scr(i, j, k) != i * 100 + j * 10 + k) + errors() |= 16u; + }); + errors() |= 256u; + }); + unsigned h_errors = 0; + Kokkos::deep_copy(h_errors, errors); + + ASSERT_EQ((h_errors & 1u), 0u) << "Rank mismatch"; + ASSERT_EQ((h_errors & 2u), 0u) << "extent 0 mismatch"; + ASSERT_EQ((h_errors & 4u), 0u) << "extent 1 mismatch"; + ASSERT_EQ((h_errors & 8u), 0u) << "extent 2 mismatch"; + ASSERT_EQ((h_errors & 16u), 0u) << "data access incorrect"; + ASSERT_EQ(h_errors, 256u); +} + +TEST(TEST_CATEGORY, dyn_rank_view_team_scratch) { + test_dyn_rank_view_team_scratch(); +} + +} // namespace diff --git a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp index 4ecb6cf25c..930c76c32c 100644 --- a/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynViewAPI.hpp @@ -792,9 +792,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -817,9 +816,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -846,9 +844,8 @@ class TestDynViewAPI { int equal_ptr_h2_d = a_h2.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value - ? 1 - : 0; + std::is_same_v ? 1 + : 0; ASSERT_EQ(equal_ptr_h_h2, 1); ASSERT_EQ(equal_ptr_h_d, is_same_memspace); ASSERT_EQ(equal_ptr_h2_d, is_same_memspace); @@ -879,8 +876,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -915,8 +911,7 @@ class TestDynViewAPI { int equal_ptr_h3_d = a_h3.data() == a_d.data() ? 1 : 0; int is_same_memspace = - std::is_same::value + std::is_same_v ? 1 : 0; ASSERT_EQ(equal_ptr_h_h2, 1); @@ -943,8 +938,6 @@ class TestDynViewAPI { dView0 d("d"); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) - // Rank 0 Kokkos::resize(d); @@ -1121,8 +1114,6 @@ class TestDynViewAPI { Kokkos::deep_copy(error_flag_host, error_flag); ASSERT_EQ(error_flag_host(), 0); #endif // MDRangePolict Rank < 7 - -#endif // defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) } static void run_test_scalar() { diff --git a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp index c8f8fed3b8..94ccea86eb 100644 --- a/lib/kokkos/containers/unit_tests/TestDynamicView.hpp +++ b/lib/kokkos/containers/unit_tests/TestDynamicView.hpp @@ -71,7 +71,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -85,7 +84,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 4x larger than previous size // the first 1/4 should remain the same @@ -93,7 +91,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -108,7 +105,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -123,7 +119,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -137,7 +132,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // add 3x more entries i.e. 4x larger than previous size // the first 1/4 should remain the same @@ -145,7 +139,6 @@ struct TestDynamicView { da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(da_size, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -160,7 +153,6 @@ struct TestDynamicView { ASSERT_EQ(new_result_sum + result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Create DynamicView, initialize size (via resize), run through @@ -175,7 +167,6 @@ struct TestDynamicView { da.resize_serial(da_size); ASSERT_EQ(da.size(), da_size); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -189,14 +180,12 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // remove the final 3/4 entries i.e. first 1/4 remain unsigned da_resize = arg_total_size / 8; da.resize_serial(da_resize); ASSERT_EQ(da.size(), da_resize); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_resize), KOKKOS_LAMBDA(const int i) { da(i) = Scalar(i); }); @@ -210,7 +199,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_resize * (da_resize - 1) / 2)); -#endif } // end scope // Test: Reproducer to demonstrate compile-time error of deep_copy @@ -229,7 +217,6 @@ struct TestDynamicView { device_dynamic_view.resize_serial(da_size); // Use parallel_for to populate device_dynamic_view and verify values -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(i); }); @@ -243,7 +230,6 @@ struct TestDynamicView { result_sum); ASSERT_EQ(result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif // Use an on-device View as intermediate to deep_copy the // device_dynamic_view to host, zero out the device_dynamic_view, @@ -251,13 +237,11 @@ struct TestDynamicView { Kokkos::deep_copy(device_view, device_dynamic_view); Kokkos::deep_copy(host_view, device_view); Kokkos::deep_copy(device_view, host_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) Kokkos::parallel_for( Kokkos::RangePolicy(0, da_size), KOKKOS_LAMBDA(const int i) { device_dynamic_view(i) = Scalar(0); }); -#endif Kokkos::deep_copy(device_dynamic_view, device_view); -#if defined(KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA) + value_type new_result_sum = 0.0; Kokkos::parallel_reduce( Kokkos::RangePolicy(0, da_size), @@ -267,21 +251,6 @@ struct TestDynamicView { new_result_sum); ASSERT_EQ(new_result_sum, (value_type)(da_size * (da_size - 1) / 2)); -#endif - - // Try to deep_copy device_dynamic_view directly to/from host. - // host-to-device currently fails to compile because DP and SP are - // swapped in the deep_copy implementation. - // Once that's fixed, both deep_copy's will fail at runtime because the - // destination execution space cannot access the source memory space. - // Check if the memory spaces are different before testing the deep_copy. - if (!Kokkos::SpaceAccessibility::accessible) { - ASSERT_THROW(Kokkos::deep_copy(host_view, device_dynamic_view), - std::runtime_error); - ASSERT_THROW(Kokkos::deep_copy(device_dynamic_view, host_view), - std::runtime_error); - } } } }; diff --git a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp index 0003a29468..4ebab889c7 100644 --- a/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp +++ b/lib/kokkos/containers/unit_tests/TestErrorReporter.hpp @@ -149,7 +149,6 @@ struct ErrorReporterDriver : public ErrorReporterDriverBase { } }; -#if !defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA) template struct ErrorReporterDriverUseLambda : public ErrorReporterDriverBase { @@ -178,7 +177,6 @@ struct ErrorReporterDriverUseLambda driver_base::check_expectations(reporter_capacity, test_size); } }; -#endif #ifdef KOKKOS_ENABLE_OPENMP struct ErrorReporterDriverNativeOpenMP @@ -205,8 +203,7 @@ struct ErrorReporterDriverNativeOpenMP // FIXME_MSVC MSVC just gets confused when using the base class in the // KOKKOS_CLASS_LAMBDA -#if !defined(KOKKOS_COMPILER_MSVC) && \ - (!defined(KOKKOS_ENABLE_CUDA) || defined(KOKKOS_ENABLE_CUDA_LAMBDA)) +#ifndef KOKKOS_COMPILER_MSVC TEST(TEST_CATEGORY, ErrorReporterViaLambda) { TestErrorReporter>(); } diff --git a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp index c133922e3d..706b40fff3 100644 --- a/lib/kokkos/containers/unit_tests/TestOffsetView.hpp +++ b/lib/kokkos/containers/unit_tests/TestOffsetView.hpp @@ -56,7 +56,18 @@ void test_offsetview_construction() { offset_view_type ov("firstOV", range0, range1); ASSERT_EQ("firstOV", ov.label()); - ASSERT_EQ(2, ov.Rank); + +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + ASSERT_EQ(2u, ov.Rank); +#endif +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS + KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + + ASSERT_EQ(2u, ov.rank()); ASSERT_EQ(ov.begin(0), -1); ASSERT_EQ(ov.end(0), 4); @@ -67,7 +78,6 @@ void test_offsetview_construction() { ASSERT_EQ(ov.extent(0), 5u); ASSERT_EQ(ov.extent(1), 5u); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) { Kokkos::Experimental::OffsetView offsetV1("OneDOffsetView", range0); @@ -149,7 +159,6 @@ void test_offsetview_construction() { } ASSERT_EQ(OVResult, answer) << "Bad data found in OffsetView"; -#endif { offset_view_type ovCopy(ov); @@ -184,7 +193,6 @@ void test_offsetview_construction() { range3_type rangePolicy3DZero(point3_type{{0, 0, 0}}, point3_type{{extent0, extent1, extent2}}); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int view3DSum = 0; Kokkos::parallel_reduce( rangePolicy3DZero, @@ -207,7 +215,6 @@ void test_offsetview_construction() { ASSERT_EQ(view3DSum, offsetView3DSum) << "construction of OffsetView from View and begins array broken."; -#endif } view_type viewFromOV = ov.view(); @@ -232,7 +239,6 @@ void test_offsetview_construction() { view_type aView("aView", ov.extent(0), ov.extent(1)); Kokkos::deep_copy(aView, ov); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -242,7 +248,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(view, offsetView) broken."; -#endif } { // test view to offsetview deep copy @@ -251,7 +256,6 @@ void test_offsetview_construction() { Kokkos::deep_copy(aView, 99); Kokkos::deep_copy(ov, aView); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) int sum = 0; Kokkos::parallel_reduce( rangePolicy2D, @@ -261,7 +265,6 @@ void test_offsetview_construction() { sum); ASSERT_EQ(sum, 0) << "deep_copy(offsetView, view) broken."; -#endif } } @@ -329,46 +332,131 @@ void test_offsetview_unmanaged_construction() { ASSERT_EQ(bb, ib); ASSERT_EQ(bb, ii); } +} + +template +void test_offsetview_unmanaged_construction_death() { + // Preallocated memory (Only need a valid address for this test) + Scalar s; + + // Regular expression syntax on Windows is a pain. `.` does not match `\n`. + // Feel free to make it work if you have time to spare. +#ifdef _WIN32 +#define SKIP_REGEX_ON_WINDOWS(REGEX) "" +#else +#define SKIP_REGEX_ON_WINDOWS(REGEX) REGEX +#endif { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must be positive - ASSERT_NO_THROW(offset_view_type(&s, {0}, {1})); - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0})); - ASSERT_THROW(offset_view_type(&s, {0}, {-1}), std::runtime_error); + (void)offset_view_type(&s, {0}, {1}); + (void)offset_view_type(&s, {0}, {0}); + ASSERT_DEATH( + offset_view_type(&s, {0}, {-1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(-1\\) - begins\\[0\\] \\(0\\)\\) must be " + "non-negative")); } { using offset_view_type = Kokkos::Experimental::OffsetView; // Range calculations must not overflow - ASSERT_NO_THROW(offset_view_type(&s, {0}, {0x7fffffffffffffffl})); - ASSERT_THROW(offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW( + (void)offset_view_type(&s, {0}, {0x7fffffffffffffffl}); + ASSERT_DEATH( + offset_view_type(&s, {-1}, {0x7fffffffffffffffl}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-1\\)\\) " + "overflows")); + ASSERT_DEATH( offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0x7fffffffffffffffl}), - std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), - std::runtime_error); + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(9223372036854775807\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); + ASSERT_DEATH( + offset_view_type(&s, {-0x7fffffffffffffffl - 1}, {0}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "\\(ends\\[0\\] \\(0\\) - begins\\[0\\] " + "\\(-9223372036854775808\\)\\) " + "overflows")); } { using offset_view_type = Kokkos::Experimental::OffsetView; - // Should throw when the rank of begins and/or ends doesn't match that of - // OffsetView - ASSERT_THROW(offset_view_type(&s, {0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1}), std::runtime_error); - ASSERT_NO_THROW(offset_view_type(&s, {0, 0}, {1, 1})); - ASSERT_THROW(offset_view_type(&s, {0, 0}, {1, 1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1}), std::runtime_error); - ASSERT_THROW(offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), - std::runtime_error); + // Should throw when the rank of begins and/or ends doesn't match that + // of OffsetView + ASSERT_DEATH( + offset_view_type(&s, {0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(1\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + (void)offset_view_type(&s, {0, 0}, {1, 1}); + ASSERT_DEATH( + offset_view_type(&s, {0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(1\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); + ASSERT_DEATH( + offset_view_type(&s, {0, 0, 0}, {1, 1, 1}), + SKIP_REGEX_ON_WINDOWS( + "Kokkos::Experimental::OffsetView ERROR: for unmanaged OffsetView" + ".*" + "begins\\.size\\(\\) \\(3\\) != Rank \\(2\\)" + ".*" + "ends\\.size\\(\\) \\(3\\) != Rank \\(2\\)")); } +#undef SKIP_REGEX_ON_WINDOWS } template @@ -377,8 +465,8 @@ void test_offsetview_subview() { Kokkos::Experimental::OffsetView sliceMe("offsetToSlice", {-10, 20}); { - auto offsetSubviewa = Kokkos::Experimental::subview(sliceMe, 0); - ASSERT_EQ(offsetSubviewa.Rank, 0) << "subview of offset is broken."; + auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0); + ASSERT_EQ(offsetSubview.rank(), 0u) << "subview of offset is broken."; } } { // test subview 2 @@ -387,13 +475,13 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), -2); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -406,30 +494,29 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::make_pair(-30, -21)); - ASSERT_EQ(offsetSubview.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 2u) << "subview of offset is broken."; ASSERT_EQ(offsetSubview.begin(0), -20); ASSERT_EQ(offsetSubview.end(0), 31); ASSERT_EQ(offsetSubview.begin(1), 0); ASSERT_EQ(offsetSubview.end(1), 9); -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) using range_type = Kokkos::MDRangePolicy, Kokkos::IndexType >; using point_type = typename range_type::point_type; @@ -455,25 +542,24 @@ void test_offsetview_subview() { sum); ASSERT_EQ(sum, 6 * (e0 - b0) * (e1 - b1)); -#endif } // slice 2 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } @@ -486,73 +572,72 @@ void test_offsetview_subview() { { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 3) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 3u) << "subview of offset is broken."; } // slice 2 auto offsetSubview2a = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview2a.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2a.rank(), 2u) << "subview of offset is broken."; { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, Kokkos::ALL(), 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, Kokkos::ALL(), 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } { auto offsetSubview2b = Kokkos::Experimental::subview( sliceMe, 0, 0, Kokkos::ALL(), Kokkos::ALL()); - ASSERT_EQ(offsetSubview2b.Rank, 2) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview2b.rank(), 2u) << "subview of offset is broken."; } // slice 3 { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, Kokkos::ALL(), 0, 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, Kokkos::ALL(), 0, 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, Kokkos::ALL(), 0); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } { auto offsetSubview = Kokkos::Experimental::subview(sliceMe, 0, 0, 0, Kokkos::ALL()); - ASSERT_EQ(offsetSubview.Rank, 1) << "subview of offset is broken."; + ASSERT_EQ(offsetSubview.rank(), 1u) << "subview of offset is broken."; } } } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) template KOKKOS_INLINE_FUNCTION T std_accumulate(InputIt first, InputIt last, T init, BinaryOperation op) { @@ -586,6 +671,7 @@ void test_offsetview_offsets_rank1() { KOKKOS_LAMBDA(const int ii, int& lerrors) { offset_view_type ov(v, {ii}); lerrors += (ov(3) != element({3 - ii})); + lerrors += (ov[3] != element({3 - ii})); }, errors); @@ -655,7 +741,6 @@ void test_offsetview_offsets_rank3() { ASSERT_EQ(0, errors); } -#endif TEST(TEST_CATEGORY, offsetview_construction) { test_offsetview_construction(); @@ -665,11 +750,15 @@ TEST(TEST_CATEGORY, offsetview_unmanaged_construction) { test_offsetview_unmanaged_construction(); } +TEST(TEST_CATEGORY_DEATH, offsetview_unmanaged_construction) { + ::testing::FLAGS_gtest_death_test_style = "threadsafe"; + test_offsetview_unmanaged_construction_death(); +} + TEST(TEST_CATEGORY, offsetview_subview) { test_offsetview_subview(); } -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) TEST(TEST_CATEGORY, offsetview_offsets_rank1) { test_offsetview_offsets_rank1(); } @@ -681,7 +770,6 @@ TEST(TEST_CATEGORY, offsetview_offsets_rank2) { TEST(TEST_CATEGORY, offsetview_offsets_rank3) { test_offsetview_offsets_rank3(); } -#endif } // namespace Test diff --git a/lib/kokkos/containers/unit_tests/TestScatterView.hpp b/lib/kokkos/containers/unit_tests/TestScatterView.hpp index 733f43122c..72c1afbe96 100644 --- a/lib/kokkos/containers/unit_tests/TestScatterView.hpp +++ b/lib/kokkos/containers/unit_tests/TestScatterView.hpp @@ -33,11 +33,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -134,11 +134,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -235,11 +235,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -335,11 +335,11 @@ struct test_scatter_view_impl_cls { public: using scatter_view_type = - Kokkos::Experimental::ScatterView; - using orig_view_type = Kokkos::View; + using orig_view_type = Kokkos::View; using size_type = typename Kokkos::HostSpace::size_type; @@ -714,7 +714,7 @@ void test_scatter_view(int64_t n) { test_sv_config.run_test(n); } #ifdef KOKKOS_ENABLE_SERIAL - if (!std::is_same::value) { + if (!std::is_same_v) { #endif test_scatter_view_config::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); - ASSERT_TRUE((std::is_same::value)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); + ASSERT_TRUE((std::is_same_v)); } } /* namespace TestStaticCrsGraph */ diff --git a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp index 4a7e826ecb..fc7435a75e 100644 --- a/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp +++ b/lib/kokkos/containers/unit_tests/TestUnorderedMap.hpp @@ -460,7 +460,7 @@ struct UnorderedMapInsert { //! Insert multiple values. template - void insert(Args &&... args) const { + void insert(Args &&...args) const { static_assert(sizeof...(Args) > 1, "Prefer the single value version"); constexpr size_t size = sizeof...(Args); Kokkos::Array values{ @@ -534,8 +534,6 @@ TEST(TEST_CATEGORY, UnorderedMap_shallow_copyable_on_device) { ASSERT_EQ(1u, test_map_copy.m_map.size()); } -#if !defined(KOKKOS_ENABLE_CUDA) || \ - (defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_CUDA_LAMBDA)) void test_unordered_map_device_capture() { TestMapCopy::map_type map; @@ -549,7 +547,6 @@ void test_unordered_map_device_capture() { TEST(TEST_CATEGORY, UnorderedMap_lambda_capturable) { test_unordered_map_device_capture(); } -#endif /** * @test This test ensures that an @ref UnorderedMap can be built diff --git a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp index 0246f11ddf..2edddcce34 100644 --- a/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp +++ b/lib/kokkos/containers/unit_tests/TestViewCtorPropEmbeddedDim.hpp @@ -48,7 +48,7 @@ struct TestViewCtorProp_EmbeddedDim { void operator()(const int i) const { v(i) = i; } }; - static void test_vcpt(const int N0, const int N1) { + static void test_vcpt(const size_t N0, const size_t N1) { // Create two views to test { using VIT = typename TestViewCtorProp_EmbeddedDim::ViewIntType; @@ -78,16 +78,16 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); #if 0 // debug output - for ( int i = 0; i < N0*N1; ++i ) { - printf(" Output check: hcv1(%d) = %lf\n ", i, hcv1(i) ); + for ( size_t i = 0; i < N0*N1; ++i ) { + printf(" Output check: hcv1(%zu) = %lf\n ", i, hcv1(i) ); } printf( " Common value type view: %s \n", typeid( CVT() ).name() ); printf( " Common value type: %s \n", typeid( CommonViewValueType() ).name() ); - if ( std::is_same< CommonViewValueType, double >::value == true ) { + if ( std::is_same_v< CommonViewValueType, double > == true ) { printf("Proper common value_type\n"); } else { @@ -115,7 +115,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } @@ -148,7 +148,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } { @@ -169,7 +169,7 @@ struct TestViewCtorProp_EmbeddedDim { HostCVT hcv1 = Kokkos::create_mirror_view(cv1); Kokkos::deep_copy(hcv1, cv1); - ASSERT_EQ((std::is_same::value), true); + ASSERT_EQ((std::is_same_v), true); } } diff --git a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp index e8558628dc..2932898554 100644 --- a/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp +++ b/lib/kokkos/containers/unit_tests/TestWithoutInitializing.hpp @@ -44,6 +44,12 @@ Kokkos::CudaSpace>) \ GTEST_SKIP() << "skipping since unified memory requires additional " \ "fences"; +#elif defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) +#define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE \ + if constexpr (std::is_same_v) \ + GTEST_SKIP() << "skipping since unified memory requires additional " \ + "fences"; #else #define GTEST_SKIP_IF_UNIFIED_MEMORY_SPACE #endif @@ -51,8 +57,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 5, 6, 7, - 8); + Kokkos::DualView bla("bla", 5, 6, 7, 8); auto success = validate_absence( [&]() { @@ -82,8 +87,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -112,8 +116,7 @@ TEST(TEST_CATEGORY, resize_exec_space_dualview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); - Kokkos::DualView bla("bla", 8, 7, 6, - 5); + Kokkos::DualView bla("bla", 8, 7, 6, 5); auto success = validate_absence( [&]() { @@ -245,7 +248,7 @@ TEST(TEST_CATEGORY, realloc_exec_space_dynrankview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif @@ -280,7 +283,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_init_scatterview) { using namespace Kokkos::Test::Tools; listen_tool_events(Config::DisableAll(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 4, 5, 6, 7); auto success = validate_absence( @@ -312,7 +315,7 @@ TEST(TEST_CATEGORY, resize_realloc_no_alloc_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableKernels(), Config::EnableAllocs()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -343,7 +346,7 @@ TEST(TEST_CATEGORY, resize_exec_space_scatterview) { listen_tool_events(Config::DisableAll(), Config::EnableFences(), Config::EnableKernels()); Kokkos::Experimental::ScatterView< - int*** * [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> + int**** [1][2][3], typename TEST_EXECSPACE::array_layout, TEST_EXECSPACE> bla("bla", 7, 6, 5, 4); auto success = validate_absence( @@ -384,13 +387,12 @@ TEST(TEST_CATEGORY, realloc_exec_space_scatterview) { // FIXME_THREADS The Threads backend fences every parallel_for #ifdef KOKKOS_ENABLE_THREADS - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the Threads backend isn't asynchronous"; #endif #if defined(KOKKOS_ENABLE_HPX) && \ !defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) - if (std::is_same::value) + if (std::is_same_v) GTEST_SKIP() << "skipping since the HPX backend always fences with async " "dispatch disabled"; #endif diff --git a/lib/kokkos/core/CMakeLists.txt b/lib/kokkos/core/CMakeLists.txt index 0917928001..21f05f6272 100644 --- a/lib/kokkos/core/CMakeLists.txt +++ b/lib/kokkos/core/CMakeLists.txt @@ -1,22 +1,14 @@ -IF (NOT Kokkos_INSTALL_TESTING) - ADD_SUBDIRECTORY(src) -ENDIF() +if(NOT Kokkos_INSTALL_TESTING) + add_subdirectory(src) +endif() -FUNCTION(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) - IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() - ENDIF() +function(KOKKOS_ADD_BENCHMARK_DIRECTORY DIR_NAME) + if(NOT Kokkos_ENABLE_BENCHMARKS) + return() + endif() - IF(KOKKOS_HAS_TRILINOS) - message( - STATUS - "Benchmarks are not supported when building as part of Trilinos" - ) - RETURN() - ENDIF() + add_subdirectory(${DIR_NAME}) +endfunction() - ADD_SUBDIRECTORY(${DIR_NAME}) -ENDFUNCTION() - -KOKKOS_ADD_TEST_DIRECTORIES(unit_test) -KOKKOS_ADD_BENCHMARK_DIRECTORY(perf_test) +kokkos_add_test_directories(unit_test) +kokkos_add_benchmark_directory(perf_test) diff --git a/lib/kokkos/core/perf_test/CMakeLists.txt b/lib/kokkos/core/perf_test/CMakeLists.txt index e0dba03e1e..0cb2c804d3 100644 --- a/lib/kokkos/core/perf_test/CMakeLists.txt +++ b/lib/kokkos/core/perf_test/CMakeLists.txt @@ -1,50 +1,36 @@ # FIXME_OPENMPTARGET - the NVIDIA HPC compiler nvc++ in the OpenMPTarget backend does not pass the perf_tests. # FIXME_OPENACC - temporarily disabled due to unimplemented features -IF ((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - RETURN() -ENDIF() -IF (KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) - RETURN() -ENDIF() +if((KOKKOS_ENABLE_OPENMPTARGET OR KOKKOS_ENABLE_OPENACC) AND KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + return() +endif() +if(KOKKOS_ENABLE_OPENACC AND KOKKOS_CXX_COMPILER_ID STREQUAL Clang) + return() +endif() # all PerformanceTest_* executables are part of regular tests # TODO: finish converting these into benchmarks (in progress) -IF(KOKKOS_ENABLE_TESTS) - IF(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) - KOKKOS_ADD_EXECUTABLE ( - PerformanceTest_SharedSpace - SOURCES test_sharedSpace.cpp - ) - ENDIF() +if(KOKKOS_ENABLE_TESTS) + if(KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_HIP OR KOKKOS_ENABLE_SYCL) + kokkos_add_executable(PerformanceTest_SharedSpace SOURCES test_sharedSpace.cpp) + endif() - KOKKOS_INCLUDE_DIRECTORIES(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) + kokkos_include_directories(REQUIRED_DURING_INSTALLATION_TESTING ${CMAKE_CURRENT_SOURCE_DIR}) - IF(NOT Kokkos_ENABLE_OPENMPTARGET) - # FIXME OPENMPTARGET needs tasking - KOKKOS_ADD_EXECUTABLE_AND_TEST( - PerformanceTest_TaskDag - SOURCES test_taskdag.cpp - CATEGORIES PERFORMANCE - ) - ENDIF() -ENDIF() + kokkos_add_executable_and_test(PerformanceTest_TaskDag SOURCES test_taskdag.cpp CATEGORIES PERFORMANCE) +endif() -IF(NOT Kokkos_ENABLE_BENCHMARKS) - RETURN() -ENDIF() - -IF (KOKKOS_HAS_TRILINOS) - message(FATAL_ERROR "Benchmarks are not supported when building as part of Trilinos") -ENDIF() +if(NOT Kokkos_ENABLE_BENCHMARKS) + return() +endif() # Find or download google/benchmark library find_package(benchmark QUIET 1.5.6) -IF(benchmark_FOUND) - MESSAGE(STATUS "Using google benchmark found in ${benchmark_DIR}") -ELSE() +if(benchmark_FOUND) + message(STATUS "Using google benchmark found in ${benchmark_DIR}") +else() message(STATUS "No installed google benchmark found, fetching from GitHub") include(FetchContent) - SET(BENCHMARK_ENABLE_TESTING OFF) + set(BENCHMARK_ENABLE_TESTING OFF) list(APPEND CMAKE_MESSAGE_INDENT "[benchmark] ") FetchContent_Declare( @@ -57,143 +43,93 @@ ELSE() list(POP_BACK CMAKE_MESSAGE_INDENT) # Suppress clang-tidy diagnostics on code that we do not have control over - IF(CMAKE_CXX_CLANG_TIDY) - SET_TARGET_PROPERTIES(benchmark PROPERTIES CXX_CLANG_TIDY "") - ENDIF() + if(CMAKE_CXX_CLANG_TIDY) + set_target_properties(benchmark PROPERTIES CXX_CLANG_TIDY "") + endif() target_compile_options(benchmark PRIVATE -w) target_compile_options(benchmark_main PRIVATE -w) -ENDIF() +endif() +function(KOKKOS_ADD_BENCHMARK NAME) + cmake_parse_arguments(BENCHMARK "" "" "SOURCES" ${ARGN}) + if(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) + message(WARNING "Unexpected arguments when adding a benchmark: " ${BENCHMARK_UNPARSED_ARGUMENTS}) + endif() -FUNCTION(KOKKOS_ADD_BENCHMARK NAME) - CMAKE_PARSE_ARGUMENTS( - BENCHMARK - "" - "" - "SOURCES" - ${ARGN} - ) - IF(DEFINED BENCHMARK_UNPARSED_ARGUMENTS) - MESSAGE( - WARNING - "Unexpected arguments when adding a benchmark: " - ${BENCHMARK_UNPARSED_ARGUMENTS} - ) - ENDIF() + set(BENCHMARK_NAME Kokkos_${NAME}) + list(APPEND BENCHMARK_SOURCES BenchmarkMain.cpp Benchmark_Context.cpp) - SET(BENCHMARK_NAME ${PACKAGE_NAME}_${NAME}) - LIST(APPEND BENCHMARK_SOURCES - BenchmarkMain.cpp - Benchmark_Context.cpp - ) + add_executable(${BENCHMARK_NAME} ${BENCHMARK_SOURCES}) + target_link_libraries(${BENCHMARK_NAME} PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version) + target_include_directories(${BENCHMARK_NAME} SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include) - ADD_EXECUTABLE( - ${BENCHMARK_NAME} - ${BENCHMARK_SOURCES} - ) - TARGET_LINK_LIBRARIES( - ${BENCHMARK_NAME} - PRIVATE benchmark::benchmark Kokkos::kokkos impl_git_version - ) - TARGET_INCLUDE_DIRECTORIES( - ${BENCHMARK_NAME} - SYSTEM PRIVATE ${benchmark_SOURCE_DIR}/include - ) + foreach(SOURCE_FILE ${BENCHMARK_SOURCES}) + set_source_files_properties(${SOURCE_FILE} PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE}) + endforeach() - FOREACH(SOURCE_FILE ${BENCHMARK_SOURCES}) - SET_SOURCE_FILES_PROPERTIES( - ${SOURCE_FILE} - PROPERTIES LANGUAGE ${KOKKOS_COMPILE_LANGUAGE} - ) - ENDFOREACH() + string(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) + set(BENCHMARK_ARGS --benchmark_counters_tabular=true --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json) - STRING(TIMESTAMP BENCHMARK_TIME "%Y-%m-%d_T%H-%M-%S" UTC) - SET( - BENCHMARK_ARGS - --benchmark_counters_tabular=true - --benchmark_out=${BENCHMARK_NAME}_${BENCHMARK_TIME}.json - ) + add_test(NAME ${BENCHMARK_NAME} COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS}) +endfunction() - ADD_TEST( - NAME ${BENCHMARK_NAME} - COMMAND ${BENCHMARK_NAME} ${BENCHMARK_ARGS} - ) -ENDFUNCTION() - -SET( - BENCHMARK_SOURCES - PerfTestGramSchmidt.cpp - PerfTest_CustomReduction.cpp - PerfTest_ExecSpacePartitioning.cpp - PerfTestHexGrad.cpp - PerfTest_MallocFree.cpp - PerfTest_ViewAllocate.cpp - PerfTest_ViewCopy_a123.cpp - PerfTest_ViewCopy_b123.cpp - PerfTest_ViewCopy_c123.cpp - PerfTest_ViewCopy_d123.cpp - PerfTest_ViewCopy_a45.cpp - PerfTest_ViewCopy_b45.cpp - PerfTest_ViewCopy_c45.cpp - PerfTest_ViewCopy_d45.cpp - PerfTest_ViewCopy_a6.cpp - PerfTest_ViewCopy_b6.cpp - PerfTest_ViewCopy_c6.cpp - PerfTest_ViewCopy_d6.cpp - PerfTest_ViewCopy_a7.cpp - PerfTest_ViewCopy_b7.cpp - PerfTest_ViewCopy_c7.cpp - PerfTest_ViewCopy_d7.cpp - PerfTest_ViewCopy_a8.cpp - PerfTest_ViewCopy_b8.cpp - PerfTest_ViewCopy_c8.cpp - PerfTest_ViewCopy_d8.cpp - PerfTest_ViewCopy_Raw.cpp - PerfTest_ViewFill_123.cpp - PerfTest_ViewFill_45.cpp - PerfTest_ViewFill_6.cpp - PerfTest_ViewFill_7.cpp - PerfTest_ViewFill_8.cpp - PerfTest_ViewFill_Raw.cpp - PerfTest_ViewResize_123.cpp - PerfTest_ViewResize_45.cpp - PerfTest_ViewResize_6.cpp - PerfTest_ViewResize_7.cpp - PerfTest_ViewResize_8.cpp - PerfTest_ViewResize_Raw.cpp -) - -IF(Kokkos_ENABLE_OPENMPTARGET) -# FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction - LIST(REMOVE_ITEM BENCHMARK_SOURCES +set(BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp PerfTest_ExecSpacePartitioning.cpp - ) -ENDIF() - -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Benchmark - SOURCES ${BENCHMARK_SOURCES} + PerfTestHexGrad.cpp + PerfTest_MallocFree.cpp + PerfTest_ViewAllocate.cpp + PerfTest_ViewCopy_a123.cpp + PerfTest_ViewCopy_b123.cpp + PerfTest_ViewCopy_c123.cpp + PerfTest_ViewCopy_d123.cpp + PerfTest_ViewCopy_a45.cpp + PerfTest_ViewCopy_b45.cpp + PerfTest_ViewCopy_c45.cpp + PerfTest_ViewCopy_d45.cpp + PerfTest_ViewCopy_a6.cpp + PerfTest_ViewCopy_b6.cpp + PerfTest_ViewCopy_c6.cpp + PerfTest_ViewCopy_d6.cpp + PerfTest_ViewCopy_a7.cpp + PerfTest_ViewCopy_b7.cpp + PerfTest_ViewCopy_c7.cpp + PerfTest_ViewCopy_d7.cpp + PerfTest_ViewCopy_a8.cpp + PerfTest_ViewCopy_b8.cpp + PerfTest_ViewCopy_c8.cpp + PerfTest_ViewCopy_d8.cpp + PerfTest_ViewCopy_Raw.cpp + PerfTest_ViewFill_123.cpp + PerfTest_ViewFill_45.cpp + PerfTest_ViewFill_6.cpp + PerfTest_ViewFill_7.cpp + PerfTest_ViewFill_8.cpp + PerfTest_ViewFill_Raw.cpp + PerfTest_ViewResize_123.cpp + PerfTest_ViewResize_45.cpp + PerfTest_ViewResize_6.cpp + PerfTest_ViewResize_7.cpp + PerfTest_ViewResize_8.cpp + PerfTest_ViewResize_Raw.cpp ) -IF(NOT KOKKOS_ENABLE_CUDA OR KOKKOS_ENABLE_CUDA_LAMBDA) - KOKKOS_ADD_BENCHMARK( - Benchmark_Atomic_MinMax - SOURCES test_atomic_minmax_simple.cpp +if(Kokkos_ENABLE_OPENMPTARGET) + # FIXME OPENMPTARGET requires TeamPolicy Reductions and Custom Reduction + list(REMOVE_ITEM BENCHMARK_SOURCES PerfTestGramSchmidt.cpp PerfTest_CustomReduction.cpp + PerfTest_ExecSpacePartitioning.cpp ) -ENDIF() +endif() + +kokkos_add_benchmark(PerformanceTest_Benchmark SOURCES ${BENCHMARK_SOURCES}) + +kokkos_add_benchmark(Benchmark_Atomic_MinMax SOURCES test_atomic_minmax_simple.cpp) # FIXME_NVHPC -IF(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) - KOKKOS_ADD_BENCHMARK( - PerformanceTest_Mempool - SOURCES test_mempool.cpp - ) -ENDIF() +if(NOT KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + kokkos_add_benchmark(PerformanceTest_Mempool SOURCES test_mempool.cpp) +endif() -KOKKOS_ADD_BENCHMARK( - PerformanceTest_Atomic - SOURCES test_atomic.cpp -) +kokkos_add_benchmark(PerformanceTest_Atomic SOURCES test_atomic.cpp) diff --git a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp index 98cb246c71..1ebe750f21 100644 --- a/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp +++ b/lib/kokkos/core/perf_test/PerfTestHexGrad.cpp @@ -34,10 +34,10 @@ struct HexGrad { enum { NSpace = 3, NNode = 8 }; using elem_coord_type = - Kokkos::View; + Kokkos::View; using elem_grad_type = - Kokkos::View; + Kokkos::View; elem_coord_type coords; elem_grad_type grad_op; diff --git a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp index 2110f38a91..03340a5d6d 100644 --- a/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_CustomReduction.cpp @@ -21,7 +21,6 @@ #include #include -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA namespace Test { template std::pair custom_reduction_test(int N, int R) { @@ -130,4 +129,3 @@ BENCHMARK(CustomReduction) ->UseManualTime(); } // namespace Test -#endif diff --git a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp index d2a3d0b823..aa23ddbb60 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ExecSpacePartitioning.cpp @@ -56,8 +56,7 @@ bool is_overlapping(const Kokkos::HIP&) { #if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) template <> -bool is_overlapping( - const Kokkos::Experimental::SYCL&) { +bool is_overlapping(const Kokkos::SYCL&) { return true; } #endif diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp index 67a8d7e555..e4db40e128 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewCopy_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) @@ -38,6 +37,5 @@ BENCHMARK(ViewDeepCopy_Raw) ->ArgName("N") ->Arg(10) ->UseManualTime(); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp index c11074d915..57bba83a9c 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewFill_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) @@ -28,6 +27,5 @@ BENCHMARK(ViewFill_Raw) ->ArgName("N") ->Arg(N) ->UseManualTime(); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp b/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp index 2d1bcbb3ca..ab469cb647 100644 --- a/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp +++ b/lib/kokkos/core/perf_test/PerfTest_ViewResize_Raw.cpp @@ -18,7 +18,6 @@ namespace Test { -#if defined(KOKKOS_ENABLE_CUDA_LAMBDA) || !defined(KOKKOS_ENABLE_CUDA) BENCHMARK(ViewResize_NoInit_Raw) ->ArgName("N") ->Arg(N) @@ -30,6 +29,5 @@ BENCHMARK(ViewResize_NoInit_Raw) ->Arg(N) ->UseManualTime() ->Iterations(R); -#endif } // namespace Test diff --git a/lib/kokkos/core/perf_test/test_mempool.cpp b/lib/kokkos/core/perf_test/test_mempool.cpp index 9905740afb..bdfe59b0b3 100644 --- a/lib/kokkos/core/perf_test/test_mempool.cpp +++ b/lib/kokkos/core/perf_test/test_mempool.cpp @@ -198,7 +198,7 @@ static void Mempool_Fill(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, @@ -225,7 +225,7 @@ static void Mempool_Alloc_Dealloc(benchmark::State& state) { int fill_level = get_parameter("--fill_level=", state.range(4)); int repeat_inner = get_parameter("--repeat_inner=", state.range(5)); int number_alloc = get_number_alloc(chunk_span, min_superblock_size, - total_alloc_size, fill_level); + total_alloc_size, fill_level); for (auto _ : state) { TestFunctor functor(total_alloc_size, min_superblock_size, number_alloc, diff --git a/lib/kokkos/core/perf_test/test_sharedSpace.cpp b/lib/kokkos/core/perf_test/test_sharedSpace.cpp index 4f140c9409..3c06770e28 100644 --- a/lib/kokkos/core/perf_test/test_sharedSpace.cpp +++ b/lib/kokkos/core/perf_test/test_sharedSpace.cpp @@ -103,7 +103,7 @@ size_t getDeviceMemorySize() { #elif defined KOKKOS_ENABLE_HIP return Kokkos::HIP{}.hip_device_prop().totalGlobalMem; #elif defined KOKKOS_ENABLE_SYCL - auto device = Kokkos::Experimental::SYCL{}.sycl_queue().get_device(); + auto device = Kokkos::SYCL{}.sycl_queue().get_device(); return device.get_info(); #else #error \ diff --git a/lib/kokkos/core/perf_test/test_taskdag.cpp b/lib/kokkos/core/perf_test/test_taskdag.cpp index fccaab64dd..347d9748b5 100644 --- a/lib/kokkos/core/perf_test/test_taskdag.cpp +++ b/lib/kokkos/core/perf_test/test_taskdag.cpp @@ -32,6 +32,11 @@ int main() { return 0; } #include +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + using ExecSpace = Kokkos::DefaultExecutionSpace; inline long eval_fib(long n) { @@ -223,4 +228,8 @@ int main(int argc, char* argv[]) { return 0; } +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + #endif diff --git a/lib/kokkos/core/src/CMakeLists.txt b/lib/kokkos/core/src/CMakeLists.txt index b84677e61b..72663739a1 100644 --- a/lib/kokkos/core/src/CMakeLists.txt +++ b/lib/kokkos/core/src/CMakeLists.txt @@ -1,118 +1,125 @@ -KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} - ${KOKKOS_TOP_BUILD_DIR} -) -IF (NOT desul_FOUND) - IF(KOKKOS_ENABLE_CUDA) - SET(DESUL_ATOMICS_ENABLE_CUDA ON) - ENDIF() - IF(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP) - SET(DESUL_ATOMICS_ENABLE_HIP ON) - ENDIF() - IF(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) - SET(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) - ENDIF() - IF(KOKKOS_ENABLE_SYCL) - SET(DESUL_ATOMICS_ENABLE_SYCL ON) - IF(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) - SET(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) - ENDIF() - ENDIF() - IF(KOKKOS_ENABLE_OPENMPTARGET) - SET(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP - ENDIF() - IF(KOKKOS_ENABLE_OPENACC) - SET(DESUL_ATOMICS_ENABLE_OPENACC ON) - ENDIF() - CONFIGURE_FILE( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/Config.hpp.cmake.in - ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp +kokkos_include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ${KOKKOS_TOP_BUILD_DIR}) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + set(DESUL_ATOMICS_ENABLE_CUDA ON) + endif() + if(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_CUDA_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_HIP) + set(DESUL_ATOMICS_ENABLE_HIP ON) + endif() + if(KOKKOS_ENABLE_HIP_RELOCATABLE_DEVICE_CODE) + set(DESUL_ATOMICS_ENABLE_HIP_SEPARABLE_COMPILATION ON) + endif() + if(KOKKOS_ENABLE_SYCL) + set(DESUL_ATOMICS_ENABLE_SYCL ON) + if(KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED AND NOT KOKKOS_IMPL_HAVE_SYCL_EXT_ONEAPI_DEVICE_GLOBAL) + set(DESUL_ATOMICS_ENABLE_SYCL_SEPARABLE_COMPILATION ON) + endif() + endif() + if(KOKKOS_ENABLE_OPENMPTARGET) + set(DESUL_ATOMICS_ENABLE_OPENMP ON) # not a typo Kokkos OpenMPTarget -> Desul OpenMP + endif() + if(KOKKOS_ENABLE_OPENACC) + # FIXME_OPENACC FIXME_CLACC - Below condition will be removed if Clacc can compile atomics. + if(KOKKOS_CXX_COMPILER_ID STREQUAL NVHPC) + set(DESUL_ATOMICS_ENABLE_OPENACC ON) + endif() + endif() + configure_file( + ${KOKKOS_SOURCE_DIR}/tpls/desul/Config.hpp.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/desul/atomics/Config.hpp ) - KOKKOS_INCLUDE_DIRECTORIES( - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() + kokkos_include_directories(${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/" +install( + DIRECTORY "${CMAKE_CURRENT_SOURCE_DIR}/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.hpp" PATTERN "*.h" ) -SET(KOKKOS_CORE_SRCS) -APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) -SET(KOKKOS_CORE_HEADERS) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) -APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) +set(KOKKOS_CORE_SRCS) +append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.cpp) +set(KOKKOS_CORE_HEADERS) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/*.hpp) +append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/impl/*.hpp) -IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/Kokkos_Cuda_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Cuda/*.hpp) +endif() -IF (KOKKOS_ENABLE_OPENMP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_OPENMP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/Kokkos_OpenMP_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMP/*.hpp) +endif() -IF (KOKKOS_ENABLE_OPENMPTARGET) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_OPENMPTARGET) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenMPTarget/*.hpp) +endif() -IF (KOKKOS_ENABLE_OPENACC) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_OPENACC) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/OpenACC/*.hpp) +endif() -IF (KOKKOS_ENABLE_THREADS) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_THREADS) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Threads/*.hpp) +endif() -IF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HIP/*.hpp) +endif() -IF (KOKKOS_ENABLE_HPX) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_HPX) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/Kokkos_HPX_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/HPX/*.hpp) +endif() -IF (KOKKOS_ENABLE_SERIAL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_SERIAL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.cpp) + if(NOT Kokkos_ENABLE_DEPRECATED_CODE_4) + list(REMOVE_ITEM KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/Kokkos_Serial_Task.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/Serial/*.hpp) +endif() -IF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) -ENDIF() +if(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.cpp) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/SYCL/*.hpp) +endif() -IF (NOT desul_FOUND) - IF (KOKKOS_ENABLE_CUDA) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_CUDA.cpp) - ELSEIF (KOKKOS_ENABLE_HIP) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_HIP.cpp) - ELSEIF (KOKKOS_ENABLE_SYCL) - APPEND_GLOB(KOKKOS_CORE_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/src/Lock_Array_SYCL.cpp) - ENDIF() - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul/*/*/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/*/*/*.inc*) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) +if(NOT desul_FOUND) + if(KOKKOS_ENABLE_CUDA) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_CUDA.cpp) + elseif(KOKKOS_ENABLE_HIP) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_HIP.cpp) + elseif(KOKKOS_ENABLE_SYCL) + append_glob(KOKKOS_CORE_SRCS ${KOKKOS_SOURCE_DIR}/tpls/desul/src/Lock_Array_SYCL.cpp) + endif() + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul/*/*/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/desul/include/*/*/*.inc*) + append_glob(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_BINARY_DIR}/desul/*.hpp) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include/desul" - "${CMAKE_CURRENT_BINARY_DIR}/desul" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/desul/include/desul" "${CMAKE_CURRENT_BINARY_DIR}/desul" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "*.inc" @@ -120,33 +127,26 @@ IF (NOT desul_FOUND) PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal desul_atomics copy") -ELSE() - MESSAGE(STATUS "Using external desul_atomics install found at:") - MESSAGE(STATUS " " ${desul_DIR}) -ENDIF() + message(STATUS "Using internal desul_atomics copy") +else() + message(STATUS "Using external desul_atomics install found at:") + message(STATUS " " ${desul_DIR}) +endif() - -KOKKOS_ADD_LIBRARY( - kokkoscore - SOURCES ${KOKKOS_CORE_SRCS} - HEADERS ${KOKKOS_CORE_HEADERS} +kokkos_add_library( + kokkoscore SOURCES ${KOKKOS_CORE_SRCS} HEADERS ${KOKKOS_CORE_HEADERS} ADD_BUILD_OPTIONS # core should be given all the necessary compiler/linker flags ) -KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${KOKKOS_TOP_BUILD_DIR} - ${CMAKE_CURRENT_BINARY_DIR} - ${CMAKE_CURRENT_SOURCE_DIR} +kokkos_lib_include_directories( + kokkoscore ${KOKKOS_TOP_BUILD_DIR} ${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR} ) -IF (NOT desul_FOUND) - KOKKOS_LIB_INCLUDE_DIRECTORIES(kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/desul/include - ) -ENDIF() +if(NOT desul_FOUND) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/desul/include) +endif() -IF (Kokkos_ENABLE_IMPL_MDSPAN) - MESSAGE(STATUS "Experimental mdspan support is enabled") +if(Kokkos_ENABLE_IMPL_MDSPAN) + message(STATUS "Experimental mdspan support is enabled") # Some compilers now include mdspan... we just flag on their version # for now until we can get some compiler detection support @@ -154,62 +154,56 @@ IF (Kokkos_ENABLE_IMPL_MDSPAN) check_include_file_cxx(experimental/mdspan KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN) check_include_file_cxx(mdspan KOKKOS_COMPILER_SUPPORTS_MDSPAN) - if (Kokkos_ENABLE_MDSPAN_EXTERNAL) - MESSAGE(STATUS "Using external mdspan") + if(Kokkos_ENABLE_MDSPAN_EXTERNAL) + message(STATUS "Using external mdspan") target_link_libraries(kokkoscore PUBLIC std::mdspan) elseif(KOKKOS_COMPILER_SUPPORTS_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied mdspan") elseif(KOKKOS_COMPILER_SUPPORTS_EXPERIMENTAL_MDSPAN AND NOT Kokkos_ENABLE_IMPL_SKIP_COMPILER_MDSPAN) message(STATUS "Using compiler-supplied experimental/mdspan") else() - KOKKOS_LIB_INCLUDE_DIRECTORIES( - kokkoscore - ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include - ) + kokkos_lib_include_directories(kokkoscore ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/__p0009_bits/*.hpp) - APPEND_GLOB(KOKKOS_CORE_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/experimental/mdspan) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/__p0009_bits/*.hpp) + append_glob(KOKKOS_CORE_HEADERS ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/experimental/mdspan) - INSTALL (DIRECTORY - "${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include/" + install( + DIRECTORY "${KOKKOS_SOURCE_DIR}/tpls/mdspan/include/" DESTINATION ${KOKKOS_HEADER_DIR} FILES_MATCHING PATTERN "mdspan" PATTERN "*.hpp" ) - MESSAGE(STATUS "Using internal mdspan directory ${CMAKE_CURRENT_SOURCE_DIR}/../../tpls/mdspan/include") + message(STATUS "Using internal mdspan directory ${KOKKOS_SOURCE_DIR}/tpls/mdspan/include") endif() -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC HWLOC) -KOKKOS_LINK_TPL(kokkoscore PUBLIC CUDA) -KOKKOS_LINK_TPL(kokkoscore PUBLIC HPX) -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBDL) +kokkos_link_tpl(kokkoscore PUBLIC HWLOC) +kokkos_link_tpl(kokkoscore PUBLIC CUDA) +kokkos_link_tpl(kokkoscore PUBLIC HPX) +kokkos_link_tpl(kokkoscore PUBLIC LIBDL) # On *nix-like systems (Linux, macOS) we need pthread for C++ std::thread -IF (NOT WIN32) - KOKKOS_LINK_TPL(kokkoscore PUBLIC THREADS) -ENDIF() -IF (NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) - KOKKOS_LINK_TPL(kokkoscore PUBLIC ROCM) -ENDIF() +if(NOT WIN32) + kokkos_link_tpl(kokkoscore PUBLIC THREADS) +endif() +if(NOT KOKKOS_ENABLE_COMPILE_AS_CMAKE_LANGUAGE) + kokkos_link_tpl(kokkoscore PUBLIC ROCM) +endif() # FIXME: We need a proper solution to figure out whether to enable # libatomic # Most compilers only require libatomic for 128-bit CAS # I (CT) had removed 128bit CAS from desul to not need libatomic. -IF (KOKKOS_ENABLE_OPENMPTARGET) +if(KOKKOS_ENABLE_OPENMPTARGET) target_link_libraries(kokkoscore PUBLIC atomic) -ENDIF() +endif() -IF (desul_FOUND) +if(desul_FOUND) target_link_libraries(kokkoscore PUBLIC desul_atomics) -ENDIF() +endif() -# FIXME_TRILINOS Trilinos doesn't allow for Kokkos to use find_dependency so we -# just append the flags in cmake/kokkos_tpls.cmake instead of linking with the -# OpenMP target. -IF(Kokkos_ENABLE_OPENMP AND NOT KOKKOS_HAS_TRILINOS) +if(Kokkos_ENABLE_OPENMP) target_link_libraries(kokkoscore PUBLIC OpenMP::OpenMP_CXX) -ENDIF() +endif() -KOKKOS_LINK_TPL(kokkoscore PUBLIC LIBQUADMATH) +kokkos_link_tpl(kokkoscore PUBLIC LIBQUADMATH) diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp index fd86976d3b..07c35e6611 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda.hpp @@ -35,7 +35,6 @@ static_assert(false, #include // CUDA_SAFE_CALL #include -#include #include #include #include diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp index 6ae24022c8..8bcd6525c9 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.cpp @@ -201,7 +201,14 @@ void *impl_allocate_common(const int device_id, } } #elif (defined(KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC) && CUDART_VERSION >= 11020) - if (arg_alloc_size >= memory_threshold_g) { + // FIXME_KEPLER Everything after Kepler should support cudaMallocAsync + int device_supports_cuda_malloc_async; + KOKKOS_IMPL_CUDA_SAFE_CALL( + cudaDeviceGetAttribute(&device_supports_cuda_malloc_async, + cudaDevAttrMemoryPoolsSupported, device_id)); + + if (arg_alloc_size >= memory_threshold_g && + device_supports_cuda_malloc_async == 1) { error_code = cudaMallocAsync(&ptr, arg_alloc_size, stream); if (error_code == cudaSuccess) { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp index e1d062d72d..1ccf38a4a1 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_CudaSpace.hpp @@ -73,9 +73,9 @@ class CudaSpace { CudaSpace(int device_id, cudaStream_t stream); public: - CudaSpace(CudaSpace&& rhs) = default; - CudaSpace(const CudaSpace& rhs) = default; - CudaSpace& operator=(CudaSpace&& rhs) = default; + CudaSpace(CudaSpace&& rhs) = default; + CudaSpace(const CudaSpace& rhs) = default; + CudaSpace& operator=(CudaSpace&& rhs) = default; CudaSpace& operator=(const CudaSpace& rhs) = default; ~CudaSpace() = default; @@ -174,9 +174,9 @@ class CudaUVMSpace { CudaUVMSpace(int device_id, cudaStream_t stream); public: - CudaUVMSpace(CudaUVMSpace&& rhs) = default; - CudaUVMSpace(const CudaUVMSpace& rhs) = default; - CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(CudaUVMSpace&& rhs) = default; + CudaUVMSpace(const CudaUVMSpace& rhs) = default; + CudaUVMSpace& operator=(CudaUVMSpace&& rhs) = default; CudaUVMSpace& operator=(const CudaUVMSpace& rhs) = default; ~CudaUVMSpace() = default; @@ -266,9 +266,9 @@ class CudaHostPinnedSpace { CudaHostPinnedSpace(int device_id, cudaStream_t stream); public: - CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; - CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; - CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(CudaHostPinnedSpace&& rhs) = default; + CudaHostPinnedSpace(const CudaHostPinnedSpace& rhs) = default; + CudaHostPinnedSpace& operator=(CudaHostPinnedSpace&& rhs) = default; CudaHostPinnedSpace& operator=(const CudaHostPinnedSpace& rhs) = default; ~CudaHostPinnedSpace() = default; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp index 5a821ab64a..058b1f538d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_GraphNodeKernel.hpp @@ -51,7 +51,8 @@ class GraphNodeKernelImpl m_graph_node_ptr = nullptr; // Basically, we have to make this mutable for the same reasons that the // global kernel buffers in the Cuda instance are mutable... - mutable Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; public: using Policy = PolicyType; @@ -61,25 +62,20 @@ class GraphNodeKernelImpl - GraphNodeKernelImpl(std::string, Kokkos::Cuda const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, Cuda const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) // This is super ugly, but it works for now and is the most minimal change // to the codebase for now... - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} // FIXME @graph Forward through the instance once that works in the backends template GraphNodeKernelImpl(Kokkos::Cuda const& ex, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::CudaSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", ex, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_cuda_graph_ptr(cudaGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -90,13 +86,21 @@ class GraphNodeKernelImpl allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const CudaSpace& mem) const { KOKKOS_EXPECTS(m_driver_storage == nullptr) - m_driver_storage = static_cast(Kokkos::CudaSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast(mem.allocate(alloc_label.c_str(), sizeof(base_t))), + [alloc_label, mem](base_t* ptr) { + mem.deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr) - return m_driver_storage; + return m_driver_storage.get(); } + + auto get_driver_storage() const { return m_driver_storage; } }; struct CudaGraphNodeAggregateKernel { @@ -128,7 +132,8 @@ struct get_graph_node_kernel_type // {{{1 template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const CudaSpace& mem, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = @@ -136,7 +141,7 @@ auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to // just always do it) - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(mem); } template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp index 625d8c317a..8e800e756d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Graph_Impl.hpp @@ -51,7 +51,14 @@ struct GraphImpl { using node_details_t = GraphNodeBackendSpecificDetails; - void _instantiate_graph() { + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. + std::vector> m_driver_storage; + + public: + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; cudaGraphNode_t error_node = nullptr; char error_log[error_log_size]; @@ -60,10 +67,10 @@ struct GraphImpl { ->cuda_graph_instantiate_wrapper(&m_graph_exec, m_graph, &error_node, error_log, error_log_size))); + KOKKOS_ENSURES(m_graph_exec); // TODO @graphs print out errors } - public: using root_node_impl_t = GraphNodeImpl; @@ -74,11 +81,11 @@ struct GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl() { // TODO @graphs we need to somehow indicate the need for a fence in the // destructor of the GraphImpl object (so that we don't have to @@ -129,6 +136,8 @@ struct GraphImpl { kernel.set_cuda_graph_node_ptr(&cuda_node); kernel.execute(); KOKKOS_ENSURES(bool(cuda_node)); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } template @@ -158,13 +167,13 @@ struct GraphImpl { &cuda_node, 1))); } - void submit() { + void submit(const execution_space& exec) { if (!bool(m_graph_exec)) { - _instantiate_graph(); + instantiate(); } KOKKOS_IMPL_CUDA_SAFE_CALL( - (m_execution_space.impl_internal_space_instance() - ->cuda_graph_launch_wrapper(m_graph_exec))); + (exec.impl_internal_space_instance()->cuda_graph_launch_wrapper( + m_graph_exec))); } execution_space const& get_execution_space() const noexcept { @@ -197,6 +206,9 @@ struct GraphImpl { m_execution_space, _graph_node_kernel_ctor_tag{}, aggregate_kernel_impl_t{}); } + + cudaGraph_t cuda_graph() { return m_graph; } + cudaGraphExec_t cuda_graph_exec() { return m_graph_exec; } }; } // end namespace Impl diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp index 158c8acdda..ec5768a7f0 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Instance.cpp @@ -26,10 +26,10 @@ #include -//#include -//#include -//#include -//#include +// #include +// #include +// #include +// #include #include #include #include @@ -687,16 +687,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << " KOKKOS_ENABLE_CUDA: yes\n"; os << "Cuda Options:\n"; - os << " KOKKOS_ENABLE_CUDA_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CUDA_LAMBDA - os << "yes\n"; -#else - os << "no\n"; -#endif -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - os << " KOKKOS_ENABLE_CUDA_LDG_INTRINSIC: "; - os << "yes\n"; -#endif os << " KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE: "; #ifdef KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE os << "yes\n"; @@ -708,12 +698,6 @@ void Cuda::print_configuration(std::ostream &os, bool /*verbose*/) const { os << "yes\n"; #else os << "no\n"; -#endif - os << " KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA: "; -#ifdef KOKKOS_ENABLE_CXX11_DISPATCH_LAMBDA - os << "yes\n"; -#else - os << "no\n"; #endif os << " KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC: "; #ifdef KOKKOS_ENABLE_IMPL_CUDA_MALLOC_ASYNC diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp index b0dadb45f7..2d00e735cb 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_KernelLaunch.hpp @@ -209,8 +209,8 @@ inline void configure_shmem_preference(const int cuda_device, // Use multiples of 8kB const size_t max_shmem_per_sm = device_props.sharedMemPerMultiprocessor; size_t carveout = shmem_per_block == 0 - ? 0 - : 100 * + ? 0 + : 100 * (((num_blocks_desired * shmem_per_block + min_shmem_size_per_sm - 1) / min_shmem_size_per_sm) * @@ -491,7 +491,10 @@ struct CudaParallelLaunchKernelInvoker< cuda_instance->m_deviceProp, block_size, shmem, desired_occupancy); } - auto* driver_ptr = Impl::allocate_driver_storage_for_kernel(driver); + auto* driver_ptr = Impl::allocate_driver_storage_for_kernel( + CudaSpace::impl_create(cuda_instance->m_cudaDev, + cuda_instance->m_stream), + driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl @@ -714,7 +717,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::launch_kernel((Args &&) args...); + base_t::launch_kernel((Args&&)args...); } }; @@ -728,7 +731,7 @@ struct CudaParallelLaunch; template CudaParallelLaunch(Args&&... args) { - base_t::create_parallel_launch_graph_node((Args &&) args...); + base_t::create_parallel_launch_graph_node((Args&&)args...); } }; diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp index 6303898400..c50ff43034 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_MDRange.hpp @@ -95,11 +95,39 @@ class ParallelFor, Kokkos::Cuda> { inline void execute() const { if (m_rp.m_num_tiles == 0) return; - const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxblocks = m_rp.space().cuda_device_prop().maxGridSize; + const auto maxthreads = m_rp.space().cuda_device_prop().maxThreadsDim; + [[maybe_unused]] const auto maxThreadsPerBlock = + m_rp.space().cuda_device_prop().maxThreadsPerBlock; + // make sure the Z dimension (it is less than x,y limits) isn't exceeded + const auto clampZ = [&](const int input) { + return (input > maxthreads[2] ? maxthreads[2] : input); + }; + // make sure the block dimensions don't exceed the max number of threads + // allowed + const auto check_block_sizes = [&]([[maybe_unused]] const dim3& block) { + KOKKOS_ASSERT(block.x > 0 && + block.x <= static_cast(maxthreads[0])); + KOKKOS_ASSERT(block.y > 0 && + block.y <= static_cast(maxthreads[1])); + KOKKOS_ASSERT(block.z > 0 && + block.z <= static_cast(maxthreads[2])); + KOKKOS_ASSERT(block.x * block.y * block.z <= + static_cast(maxThreadsPerBlock)); + }; + // make sure the grid dimensions don't exceed the max number of blocks + // allowed + const auto check_grid_sizes = [&]([[maybe_unused]] const dim3& grid) { + KOKKOS_ASSERT(grid.x > 0 && + grid.x <= static_cast(maxblocks[0])); + KOKKOS_ASSERT(grid.y > 0 && + grid.y <= static_cast(maxblocks[1])); + KOKKOS_ASSERT(grid.z > 0 && + grid.z <= static_cast(maxblocks[2])); + }; if (RP::rank == 2) { const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], 1); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -108,13 +136,12 @@ class ParallelFor, Kokkos::Cuda> { (m_rp.m_upper[1] - m_rp.m_lower[1] + block.y - 1) / block.y, maxblocks[1]), 1); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 3) { - const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], m_rp.m_tile[2]); - KOKKOS_ASSERT(block.x > 0); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + const dim3 block(m_rp.m_tile[0], m_rp.m_tile[1], clampZ(m_rp.m_tile[2])); + check_block_sizes(block); const dim3 grid( std::min( (m_rp.m_upper[0] - m_rp.m_lower[0] + block.x - 1) / block.x, @@ -125,15 +152,16 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[2] - m_rp.m_lower[2] + block.z - 1) / block.z, maxblocks[2])); + // ensure we don't exceed the capability of the device + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 4) { // id0,id1 encoded within threadIdx.x; id2 to threadIdx.y; id3 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2], - m_rp.m_tile[3]); - KOKKOS_ASSERT(block.y > 0); - KOKKOS_ASSERT(block.z > 0); + clampZ(m_rp.m_tile[3])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -143,14 +171,15 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[3] - m_rp.m_lower[3] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 5) { // id0,id1 encoded within threadIdx.x; id2,id3 to threadIdx.y; id4 to // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], - m_rp.m_tile[2] * m_rp.m_tile[3], m_rp.m_tile[4]); - KOKKOS_ASSERT(block.z > 0); + m_rp.m_tile[2] * m_rp.m_tile[3], clampZ(m_rp.m_tile[4])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -159,6 +188,7 @@ class ParallelFor, Kokkos::Cuda> { std::min( (m_rp.m_upper[4] - m_rp.m_lower[4] + block.z - 1) / block.z, maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else if (RP::rank == 6) { @@ -166,7 +196,8 @@ class ParallelFor, Kokkos::Cuda> { // threadIdx.z const dim3 block(m_rp.m_tile[0] * m_rp.m_tile[1], m_rp.m_tile[2] * m_rp.m_tile[3], - m_rp.m_tile[4] * m_rp.m_tile[5]); + clampZ(m_rp.m_tile[4] * m_rp.m_tile[5])); + check_block_sizes(block); const dim3 grid( std::min(m_rp.m_tile_end[0] * m_rp.m_tile_end[1], maxblocks[0]), @@ -174,6 +205,7 @@ class ParallelFor, Kokkos::Cuda> { maxblocks[1]), std::min(m_rp.m_tile_end[4] * m_rp.m_tile_end[5], maxblocks[2])); + check_grid_sizes(grid); CudaParallelLaunch( *this, grid, block, 0, m_rp.space().impl_internal_space_instance()); } else { diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp index 334834938a..8251fcb248 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Range.hpp @@ -48,7 +48,7 @@ class ParallelFor, Kokkos::Cuda> { const FunctorType m_functor; const Policy m_policy; - ParallelFor() = delete; + ParallelFor() = delete; ParallelFor& operator=(const ParallelFor&) = delete; template diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp index 71e7751821..a2955e3ab6 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Parallel_Team.hpp @@ -539,9 +539,14 @@ class ParallelFor, m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -631,7 +636,7 @@ class ParallelReduce word_count(m_functor_reducer.get_reducer().value_size() / sizeof(word_size_type)); - reference_type value = m_functor_reducer.get_reducer().init( - kokkos_impl_cuda_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = + m_functor_reducer.get_reducer().init(reinterpret_cast( + kokkos_impl_cuda_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league const int int_league_size = (int)m_league_size; @@ -895,11 +901,16 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp index 86d6d91bbe..5090e84c38 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Task.hpp @@ -31,6 +31,9 @@ //---------------------------------------------------------------------------- +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() + #if defined(__CUDA_ARCH__) #define KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN(MSG) \ { \ @@ -584,9 +587,9 @@ class TaskExec { private: enum : int { WarpSize = Kokkos::Impl::CudaTraits::WarpSize }; - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; + TaskExec(TaskExec&&) = delete; + TaskExec(TaskExec const&) = delete; + TaskExec& operator=(TaskExec&&) = delete; TaskExec& operator=(TaskExec const&) = delete; friend class Kokkos::Impl::TaskQueue< @@ -1224,5 +1227,7 @@ KOKKOS_INLINE_FUNCTION void single( #undef KOKKOS_IMPL_CUDA_SYNCWARP_OR_RETURN +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() + #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ #endif /* #ifndef KOKKOS_IMPL_CUDA_TASK_HPP */ diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp index c2b5f1fa78..aec692c2c3 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Team.hpp @@ -184,24 +184,37 @@ class CudaTeamMember { * ( 1 == blockDim.z ) */ template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer) const noexcept { team_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION std::enable_if_t> team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE(( + typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( - (typename Impl::FunctorAnalysis< - Impl::FunctorPatternInterface::REDUCE, TeamPolicy, - ReducerType, typename ReducerType::value_type>::Reducer - wrapped_reducer(reducer); - cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y); - reducer.reference() = value;)) + (cuda_intra_block_reduction(value, wrapped_reducer, blockDim.y);)) } //-------------------------------------------------------------------------- @@ -260,23 +273,42 @@ class CudaTeamMember { //---------------------------------------- template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer) { vector_reduce(reducer, reducer.reference()); } template - KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> + KOKKOS_INLINE_FUNCTION static std::enable_if_t> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { (void)reducer; (void)value; + + KOKKOS_IF_ON_DEVICE( + (typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, typename ReducerType::value_type>::Reducer + wrapped_reducer(reducer); + + impl_vector_reduce(wrapped_reducer, value); + reducer.reference() = value;)) + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer_v> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { + (void)wrapped_reducer; + (void)value; + KOKKOS_IF_ON_DEVICE( (if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; unsigned mask = blockDim.x == 32 @@ -287,7 +319,7 @@ class CudaTeamMember { for (int i = blockDim.x; (i >>= 1);) { Impl::in_place_shfl_down(tmp2, tmp, i, blockDim.x, mask); if ((int)threadIdx.x < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -297,7 +329,7 @@ class CudaTeamMember { // and thus different threads could have different results. Impl::in_place_shfl(tmp2, tmp, 0, blockDim.x, mask); - value = tmp2; reducer.reference() = tmp2;)) + value = tmp2;)) } //---------------------------------------- @@ -487,14 +519,21 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { KOKKOS_IF_ON_DEVICE( - (typename ReducerType::value_type value; + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - reducer.init(value); + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value);)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); reducer.reference() = value;)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -518,16 +557,25 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (ValueType val; Kokkos::Sum reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); value_type value{}; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; - i < loop_boundaries.end; i += blockDim.y) { closure(i, val); } + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference();)) + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } template @@ -548,16 +596,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::CudaTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - KOKKOS_IF_ON_DEVICE((typename ReducerType::value_type value; - reducer.init(value); + KOKKOS_IF_ON_DEVICE( + (using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, value); } + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); + + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value;)) - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value);)) // Avoid bogus warning about reducer value being uninitialized with combined // reducers KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; @@ -573,18 +632,27 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE((ValueType val; Kokkos::Sum reducer(val); - reducer.init(reducer.reference()); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + - threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.y * blockDim.x) { closure(i, val); } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference();)) + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } + + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) } //---------------------------------------------------------------------------- @@ -632,13 +700,22 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< Closure const& closure, ReducerType const& reducer) { KOKKOS_IF_ON_DEVICE(( - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; - i += blockDim.x) { closure(i, reducer.reference()); } + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } - Impl::CudaTeamMember::vector_reduce(reducer); + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); reducer.reference() = value; )) // Avoid bogus warning about reducer value being uninitialized with combined @@ -667,15 +744,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< (void)loop_boundaries; (void)closure; (void)result; - KOKKOS_IF_ON_DEVICE( - (result = ValueType(); - for (iType i = loop_boundaries.start + threadIdx.x; - i < loop_boundaries.end; i += blockDim.x) { closure(i, result); } + KOKKOS_IF_ON_DEVICE(( - Impl::CudaTeamMember::vector_reduce(Kokkos::Sum(result)); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - )) + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); + + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::CudaTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value; + + )) } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp index a3f4f2f4cc..9e0c5819f7 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_Vectorization.hpp @@ -125,8 +125,8 @@ struct in_place_shfl_op { struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -136,28 +136,28 @@ struct in_place_shfl_fn : in_place_shfl_op { }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { return __shfl_up_sync(mask, val, lane, width); } }; template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE T do_shfl_op(unsigned mask, T& val, - int lane, int width) const - noexcept { + int lane, + int width) const noexcept { (void)mask; (void)val; (void)lane; @@ -168,7 +168,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp index 517c592af7..0ac2d4052d 100644 --- a/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Cuda/Kokkos_Cuda_ZeroMemset.hpp @@ -23,15 +23,12 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Cuda& exec_space_instance, - const View& dst) { +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::Cuda& exec_space_instance, void* dst, size_t cnt) { KOKKOS_IMPL_CUDA_SAFE_CALL( (exec_space_instance.impl_internal_space_instance() - ->cuda_memset_async_wrapper( - dst.data(), 0, - dst.size() * sizeof(typename View::value_type)))); + ->cuda_memset_async_wrapper(dst, 0, cnt))); } }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp index aced2083ff..8de3a8758f 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP.cpp @@ -27,6 +27,8 @@ #include +#include + namespace Kokkos { #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 @@ -49,34 +51,44 @@ void HIP::impl_initialize(InitializationSettings const& settings) { Impl::HIPInternal::m_hipDev = hip_device_id; KOKKOS_IMPL_HIP_SAFE_CALL( hipGetDeviceProperties(&Impl::HIPInternal::m_deviceProp, hip_device_id)); - const auto& hipProp = Impl::HIPInternal::m_deviceProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipSetDevice(hip_device_id)); - // number of multiprocessors - Impl::HIPInternal::m_multiProcCount = hipProp.multiProcessorCount; - - //---------------------------------- - // Maximum number of warps, - // at most one warp per thread in a warp for reduction. - Impl::HIPInternal::m_maxWarpCount = - hipProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; - if (Impl::HIPTraits::WarpSize < Impl::HIPInternal::m_maxWarpCount) { - Impl::HIPInternal::m_maxWarpCount = Impl::HIPTraits::WarpSize; + // Check that we are running on the expected architecture. We print a warning + // instead of erroring out because AMD does not guarantee that gcnArchName + // will always contain the gfx flag. + if (Kokkos::show_warnings()) { + if (std::string_view arch_name = + Impl::HIPInternal::m_deviceProp.gcnArchName; + arch_name.find(KOKKOS_ARCH_AMD_GPU) != 0) { + std::cerr + << "Kokkos::HIP::initialize WARNING: running kernels compiled for " + << KOKKOS_ARCH_AMD_GPU << " on " << arch_name << " device.\n"; + } } - //---------------------------------- - // Maximum number of blocks - Impl::HIPInternal::m_maxBlock[0] = hipProp.maxGridSize[0]; - Impl::HIPInternal::m_maxBlock[1] = hipProp.maxGridSize[1]; - Impl::HIPInternal::m_maxBlock[2] = hipProp.maxGridSize[2]; + // Print a warning if the user did not select the right GFX942 architecture +#ifdef KOKKOS_ARCH_AMD_GFX942 + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 1)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300X " + "(discrete GPU) on a MI300A (APU).\n"; + } +#endif +#ifdef KOKKOS_ARCH_AMD_GFX942_APU + if ((Kokkos::show_warnings()) && + (Impl::HIPInternal::m_deviceProp.integrated == 0)) { + std::cerr << "Kokkos::HIP::initialize WARNING: running kernels for MI300A " + "(APU) on a MI300X (discrete GPU).\n"; + } +#endif - // theoretically, we can get 40 WF's / CU, but only can sustain 32 see - // https://github.com/ROCm-Developer-Tools/HIP/blob/a0b5dfd625d99af7e288629747b40dd057183173/vdi/hip_platform.cpp#L742 - Impl::HIPInternal::m_maxWavesPerCU = 32; - Impl::HIPInternal::m_shmemPerSM = hipProp.maxSharedMemoryPerMultiProcessor; - Impl::HIPInternal::m_maxShmemPerBlock = hipProp.sharedMemPerBlock; + // theoretically on GFX 9XX GPUs, we can get 40 WF's / CU, but only can + // sustain 32 see + // https://github.com/ROCm/clr/blob/4d0b815d06751735e6a50fa46e913fdf85f751f0/hipamd/src/hip_platform.cpp#L362-L366 + const int maxWavesPerCU = + Impl::HIPInternal::m_deviceProp.major <= 9 ? 32 : 64; Impl::HIPInternal::m_maxThreadsPerSM = - Impl::HIPInternal::m_maxWavesPerCU * Impl::HIPTraits::WarpSize; + maxWavesPerCU * Impl::HIPTraits::WarpSize; // Init the array for used for arbitrarily sized atomics desul::Impl::init_lock_arrays(); // FIXME @@ -146,10 +158,6 @@ void HIP::print_configuration(std::ostream& os, bool /*verbose*/) const { #else os << "no\n"; #endif -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY - os << " KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY: "; - os << "yes\n"; -#endif os << "\nRuntime Configuration:\n"; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp index 1f084c41e5..90e5cf7355 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_BlockSize_Deduction.hpp @@ -113,8 +113,9 @@ unsigned hip_internal_get_block_size(const HIPInternal *hip_instance, const unsigned min_waves_per_eu = LaunchBounds::minBperSM ? LaunchBounds::minBperSM : 1; const unsigned min_threads_per_sm = min_waves_per_eu * HIPTraits::WarpSize; - const unsigned shmem_per_sm = hip_instance->m_shmemPerSM; - unsigned block_size = tperb_reg; + const unsigned shmem_per_sm = + hip_instance->m_deviceProp.maxSharedMemoryPerMultiProcessor; + unsigned block_size = tperb_reg; do { unsigned total_shmem = f(block_size); // find how many threads we can fit with this blocksize based on LDS usage diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp index 5f0df72df1..584cc63d95 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_GraphNodeKernel.hpp @@ -44,22 +44,17 @@ class GraphNodeKernelImpl // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::HIP const&, Functor arg_functor, + GraphNodeKernelImpl(std::string label_, HIP const&, Functor arg_functor, PolicyDeduced&& arg_policy, ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...), + label(std::move(label_)) {} template GraphNodeKernelImpl(Kokkos::HIP const& exec_space, Functor arg_functor, PolicyDeduced&& arg_policy) - : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} - - ~GraphNodeKernelImpl() { - if (m_driver_storage) { - Kokkos::HIPSpace().deallocate(m_driver_storage, sizeof(base_t)); - } - } + : GraphNodeKernelImpl("[unlabeled]", exec_space, std::move(arg_functor), + (PolicyDeduced&&)arg_policy) {} void set_hip_graph_ptr(hipGraph_t* arg_graph_ptr) { m_graph_ptr = arg_graph_ptr; @@ -73,18 +68,29 @@ class GraphNodeKernelImpl hipGraph_t const* get_hip_graph_ptr() const { return m_graph_ptr; } - Kokkos::ObservingRawPtr allocate_driver_memory_buffer() const { + Kokkos::ObservingRawPtr allocate_driver_memory_buffer( + const HIP& exec) const { KOKKOS_EXPECTS(m_driver_storage == nullptr); - m_driver_storage = static_cast(Kokkos::HIPSpace().allocate( - "GraphNodeKernel global memory functor storage", sizeof(base_t))); + std::string alloc_label = + label + " - GraphNodeKernel global memory functor storage"; + m_driver_storage = std::shared_ptr( + static_cast( + HIPSpace().allocate(exec, alloc_label.c_str(), sizeof(base_t))), + // FIXME_HIP Custom deletor should use same 'exec' as for allocation. + [alloc_label](base_t* ptr) { + HIPSpace().deallocate(alloc_label.c_str(), ptr, sizeof(base_t)); + }); KOKKOS_ENSURES(m_driver_storage != nullptr); - return m_driver_storage; + return m_driver_storage.get(); } + auto get_driver_storage() const { return m_driver_storage; } + private: Kokkos::ObservingRawPtr m_graph_ptr = nullptr; Kokkos::ObservingRawPtr m_graph_node_ptr = nullptr; - Kokkos::OwningRawPtr m_driver_storage = nullptr; + mutable std::shared_ptr m_driver_storage = nullptr; + std::string label; }; struct HIPGraphNodeAggregateKernel { @@ -114,13 +120,14 @@ struct get_graph_node_kernel_type Kokkos::ParallelReduceTag>> {}; template -auto* allocate_driver_storage_for_kernel(KernelType const& kernel) { +auto* allocate_driver_storage_for_kernel(const HIP& exec, + KernelType const& kernel) { using graph_node_kernel_t = typename get_graph_node_kernel_type::type; auto const& kernel_as_graph_kernel = static_cast(kernel); - return kernel_as_graph_kernel.allocate_driver_memory_buffer(); + return kernel_as_graph_kernel.allocate_driver_memory_buffer(exec); } template diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp index a0989fe671..4f97214ca6 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Graph_Impl.hpp @@ -42,11 +42,11 @@ class GraphImpl { // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); @@ -60,7 +60,7 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::HIP& exec); Kokkos::HIP const& get_execution_space() const noexcept; @@ -69,18 +69,28 @@ class GraphImpl { template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec); constexpr size_t error_log_size = 256; hipGraphNode_t error_node = nullptr; char error_log[error_log_size]; KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphInstantiate( &m_graph_exec, m_graph, &error_node, error_log, error_log_size)); + KOKKOS_ENSURES(m_graph_exec); } + hipGraph_t hip_graph() { return m_graph; } + hipGraphExec_t hip_graph_exec() { return m_graph_exec; } + + private: Kokkos::HIP m_execution_space; hipGraph_t m_graph = nullptr; hipGraphExec_t m_graph_exec = nullptr; + + // Store drivers for the kernel nodes that launch in global memory. + // This is required as lifetime of drivers must be bounded to this instance's + // lifetime. + std::vector> m_driver_storage; }; inline GraphImpl::~GraphImpl() { @@ -123,6 +133,8 @@ inline void GraphImpl::add_node( kernel.set_hip_graph_node_ptr(&node); kernel.execute(); KOKKOS_ENSURES(node); + if (std::shared_ptr tmp = kernel.get_driver_storage()) + m_driver_storage.push_back(std::move(tmp)); } // Requires PredecessorRef is a specialization of GraphNodeRef that has @@ -145,16 +157,15 @@ inline void GraphImpl::add_predecessor( hipGraphAddDependencies(m_graph, &pred_node, &node, 1)); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::HIP& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - KOKKOS_IMPL_HIP_SAFE_CALL( - hipGraphLaunch(m_graph_exec, m_execution_space.hip_stream())); + KOKKOS_IMPL_HIP_SAFE_CALL(hipGraphLaunch(m_graph_exec, exec.hip_stream())); } -inline Kokkos::HIP const& GraphImpl::get_execution_space() const - noexcept { +inline Kokkos::HIP const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp index e0b25c6939..54e8c315e3 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.cpp @@ -77,7 +77,8 @@ std::size_t scratch_count(const std::size_t size) { //---------------------------------------------------------------------------- int HIPInternal::concurrency() { - static int const concurrency = m_maxThreadsPerSM * m_multiProcCount; + static int const concurrency = + m_maxThreadsPerSM * m_deviceProp.multiProcessorCount; return concurrency; } @@ -97,6 +98,13 @@ void HIPInternal::print_configuration(std::ostream &s) const { << "undefined\n"; #endif + s << "macro KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC: "; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + s << "yes\n"; +#else + s << "no\n"; +#endif + for (int i : get_visible_devices()) { hipDeviceProp_t hipProp; KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceProperties(&hipProp, i)); @@ -177,8 +185,16 @@ void HIPInternal::initialize(hipStream_t stream) { // and scratch space for partial reduction values. // Allocate some initial space. This will grow as needed. { + // Maximum number of warps, + // at most one warp per thread in a warp for reduction. + unsigned int maxWarpCount = + m_deviceProp.maxThreadsPerBlock / Impl::HIPTraits::WarpSize; + if (Impl::HIPTraits::WarpSize < maxWarpCount) { + maxWarpCount = Impl::HIPTraits::WarpSize; + } + const unsigned reduce_block_count = - m_maxWarpCount * Impl::HIPTraits::WarpSize; + maxWarpCount * Impl::HIPTraits::WarpSize; (void)scratch_flags(reduce_block_count * 2 * sizeof(size_type)); (void)scratch_space(reduce_block_count * 16 * sizeof(size_type)); @@ -353,14 +369,8 @@ void HIPInternal::finalize() { m_num_scratch_locks = 0; } -int HIPInternal::m_hipDev = -1; -unsigned HIPInternal::m_multiProcCount = 0; -unsigned HIPInternal::m_maxWarpCount = 0; -std::array HIPInternal::m_maxBlock = {0, 0, 0}; -unsigned HIPInternal::m_maxWavesPerCU = 0; -int HIPInternal::m_shmemPerSM = 0; -int HIPInternal::m_maxShmemPerBlock = 0; -int HIPInternal::m_maxThreadsPerSM = 0; +int HIPInternal::m_hipDev = -1; +int HIPInternal::m_maxThreadsPerSM = 0; hipDeviceProp_t HIPInternal::m_deviceProp; @@ -372,15 +382,7 @@ std::mutex HIPInternal::constantMemMutex; //---------------------------------------------------------------------------- Kokkos::HIP::size_type hip_internal_multiprocessor_count() { - return HIPInternal::singleton().m_multiProcCount; -} - -Kokkos::HIP::size_type hip_internal_maximum_warp_count() { - return HIPInternal::singleton().m_maxWarpCount; -} - -std::array hip_internal_maximum_grid_count() { - return HIPInternal::singleton().m_maxBlock; + return HIPInternal::singleton().m_deviceProp.multiProcessorCount; } Kokkos::HIP::size_type *hip_internal_scratch_space(const HIP &instance, diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp index 437a84253f..d8043dc23d 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Instance.hpp @@ -31,7 +31,7 @@ namespace Impl { struct HIPTraits { #if defined(KOKKOS_ARCH_AMD_GFX906) || defined(KOKKOS_ARCH_AMD_GFX908) || \ defined(KOKKOS_ARCH_AMD_GFX90A) || defined(KOKKOS_ARCH_AMD_GFX940) || \ - defined(KOKKOS_ARCH_AMD_GFX942) + defined(KOKKOS_ARCH_AMD_GFX942) || defined(KOKKOS_ARCH_AMD_GFX942_APU) static constexpr int WarpSize = 64; static constexpr int WarpIndexMask = 0x003f; /* hexadecimal for 63 */ static constexpr int WarpIndexShift = 6; /* WarpSize == 1 << WarpShift*/ @@ -52,8 +52,6 @@ struct HIPTraits { //---------------------------------------------------------------------------- -HIP::size_type hip_internal_maximum_warp_count(); -std::array hip_internal_maximum_grid_count(); HIP::size_type hip_internal_multiprocessor_count(); HIP::size_type *hip_internal_scratch_space(const HIP &instance, @@ -72,12 +70,6 @@ class HIPInternal { using size_type = ::Kokkos::HIP::size_type; static int m_hipDev; - static unsigned m_multiProcCount; - static unsigned m_maxWarpCount; - static std::array m_maxBlock; - static unsigned m_maxWavesPerCU; - static int m_shmemPerSM; - static int m_maxShmemPerBlock; static int m_maxThreadsPerSM; static hipDeviceProp_t m_deviceProp; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp index 7cd0afcf47..e243eb07e7 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_KernelLaunch.hpp @@ -25,11 +25,7 @@ #include #include -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) -#define KOKKOS_IMPL_HIP_GRAPH_ENABLED -#endif - -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH #include #include #endif @@ -173,15 +169,15 @@ struct DeduceHIPLaunchMechanism { static constexpr HIPLaunchMechanism launch_mechanism = ((property & force_global_launch) == force_global_launch) ? HIPLaunchMechanism::GlobalMemory - : ((property & light_weight) == light_weight) - ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit - ? HIPLaunchMechanism::LocalMemory - : HIPLaunchMechanism::GlobalMemory) - : (((property & heavy_weight) == heavy_weight) - ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage - ? HIPLaunchMechanism::ConstantMemory - : HIPLaunchMechanism::GlobalMemory) - : (default_launch_mechanism)); + : ((property & light_weight) == light_weight) + ? (sizeof(DriverType) < HIPTraits::KernelArgumentLimit + ? HIPLaunchMechanism::LocalMemory + : HIPLaunchMechanism::GlobalMemory) + : (((property & heavy_weight) == heavy_weight) + ? (sizeof(DriverType) < HIPTraits::ConstantMemoryUsage + ? HIPLaunchMechanism::ConstantMemory + : HIPLaunchMechanism::GlobalMemory) + : (default_launch_mechanism)); }; template m_stream, ManageStream::no), driver); // Unlike in the non-graph case, we can get away with doing an async copy // here because the `DriverType` instance is held in the GraphNodeImpl // which is guaranteed to be alive until the graph instance itself is // destroyed, where there should be a fence ensuring that the allocation // associated with this kernel on the device side isn't deleted. - hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), hipMemcpyDefault, - hip_instance->m_stream); + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemcpyAsync(driver_ptr, &driver, sizeof(DriverType), + hipMemcpyDefault, hip_instance->m_stream)); void const *args[] = {&driver_ptr}; @@ -551,11 +549,11 @@ struct HIPParallelLaunch< LaunchMechanism>; HIPParallelLaunch(const DriverType &driver, const dim3 &grid, - const dim3 &block, const int shmem, + const dim3 &block, const unsigned int shmem, const HIPInternal *hip_instance, const bool /*prefer_shmem*/) { if ((grid.x != 0) && ((block.x * block.y * block.z) != 0)) { - if (hip_instance->m_maxShmemPerBlock < shmem) { + if (hip_instance->m_deviceProp.sharedMemPerBlock < shmem) { Kokkos::Impl::throw_runtime_exception( "HIPParallelLaunch FAILED: shared memory request is too large"); } @@ -585,7 +583,7 @@ void hip_parallel_launch(const DriverType &driver, const dim3 &grid, const dim3 &block, const int shmem, const HIPInternal *hip_instance, const bool prefer_shmem) { -#ifdef KOKKOS_IMPL_HIP_GRAPH_ENABLED +#ifdef KOKKOS_IMPL_HIP_NATIVE_GRAPH if constexpr (DoGraph) { // Graph launch using base_t = HIPParallelLaunchKernelInvoker, HIP> { const Policy m_policy; public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { @@ -57,7 +57,7 @@ class ParallelFor, HIP> { inline void execute() const { using ClosureType = ParallelFor; if (m_policy.m_num_tiles == 0) return; - auto const maxblocks = hip_internal_maximum_grid_count(); + auto const maxblocks = m_policy.space().hip_device_prop().maxGridSize; if (Policy::rank == 2) { dim3 const block(m_policy.m_tile[0], m_policy.m_tile[1], 1); dim3 const grid( diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp index 9355c1c75f..3985dc60f0 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Range.hpp @@ -53,8 +53,8 @@ class ParallelFor, Kokkos::HIP> { public: using functor_type = FunctorType; - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; inline __device__ void operator()() const { diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp index bf0c219338..83e890bce9 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelFor_Team.hpp @@ -71,8 +71,8 @@ class ParallelFor, HIP> { } public: - ParallelFor() = delete; - ParallelFor(ParallelFor const&) = default; + ParallelFor() = delete; + ParallelFor(ParallelFor const&) = default; ParallelFor& operator=(ParallelFor const&) = delete; __device__ inline void operator()() const { @@ -120,9 +120,14 @@ class ParallelFor, HIP> { m_vector_size(arg_policy.impl_vector_length()) { auto internal_space_instance = m_policy.space().impl_internal_space_instance(); - m_team_size = m_team_size >= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor, ParallelForTag()); + if (m_team_size < 0) { + m_team_size = + arg_policy.team_size_recommended(arg_functor, ParallelForTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = @@ -149,8 +154,9 @@ class ParallelFor, HIP> { static_cast(m_league_size)))); } - int const shmem_size_total = m_shmem_begin + m_shmem_size; - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + unsigned int const shmem_size_total = m_shmem_begin + m_shmem_size; + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception(std::string( "Kokkos::Impl::ParallelFor< HIP > insufficient shared memory")); } diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp index 0c24e5cc62..fb4ff937cd 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ParallelReduce_Team.hpp @@ -46,6 +46,22 @@ class ParallelReduce 4 bytes in size, indexing into shared/global memory relies + // on the block and grid dimensions to ensure that we index at the correct + // offset rather than at every 4 byte word; such that, when the join is + // performed, we have the correct data that was copied over in chunks of 4 + // bytes. + using word_size_type = std::conditional_t< + sizeof(value_type) < sizeof(Kokkos::HIP::size_type), + std::conditional_t, + Kokkos::HIP::size_type>; using reducer_type = ReducerType; using size_type = HIP::size_type; @@ -72,7 +88,7 @@ class ParallelReduce const - word_count(reducer.value_size() / sizeof(size_type)); + integral_nonzero_constant const + word_count(reducer.value_size() / sizeof(word_size_type)); - reference_type value = - reducer.init(kokkos_impl_hip_shared_memory() + - threadIdx.y * word_count.value); + reference_type value = reducer.init(reinterpret_cast( + kokkos_impl_hip_shared_memory() + + threadIdx.y * word_count.value)); // Iterate this block through the league iterate_through_league(threadid, value); // Reduce with final value at blockDim.y - 1 location. bool do_final_reduce = (m_league_size == 0); if (!do_final_reduce) - do_final_reduce = - hip_single_inter_block_reduce_scan( - reducer, blockIdx.x, gridDim.x, - kokkos_impl_hip_shared_memory(), m_scratch_space, - m_scratch_flags); + do_final_reduce = hip_single_inter_block_reduce_scan( + reducer, blockIdx.x, gridDim.x, + kokkos_impl_hip_shared_memory(), m_scratch_space, + m_scratch_flags); if (do_final_reduce) { // This is the final block with the final result at the final threads' // location - size_type* const shared = kokkos_impl_hip_shared_memory() + - (blockDim.y - 1) * word_count.value; - size_type* const global = m_result_ptr_device_accessible - ? reinterpret_cast(m_result_ptr) - : m_scratch_space; + word_size_type* const shared = + kokkos_impl_hip_shared_memory() + + (blockDim.y - 1) * word_count.value; + size_type* const global = + m_result_ptr_device_accessible + ? reinterpret_cast(m_result_ptr) + : m_scratch_space; if (threadIdx.y == 0) { reducer.final(reinterpret_cast(shared)); @@ -227,7 +244,8 @@ class ParallelReduce(m_scratch_space), result, m_scratch_flags, blockDim.y)) { unsigned int const id = threadIdx.y * blockDim.x + threadIdx.x; if (id == 0) { @@ -249,8 +267,9 @@ class ParallelReduce(hip_internal_scratch_space( + m_policy.space(), reducer.value_size() * block_count)); m_scratch_flags = hip_internal_scratch_flags(m_policy.space(), sizeof(size_type)); @@ -306,11 +325,15 @@ class ParallelReduce= 0 ? m_team_size - : arg_policy.team_size_recommended( - arg_functor_reducer.get_functor(), - arg_functor_reducer.get_reducer(), - ParallelReduceTag()); + if (m_team_size < 0) { + m_team_size = arg_policy.team_size_recommended( + arg_functor_reducer.get_functor(), arg_functor_reducer.get_reducer(), + ParallelReduceTag()); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } m_team_begin = UseShflReduction @@ -356,7 +379,8 @@ class ParallelReduce bad team size")); } - if (internal_space_instance->m_maxShmemPerBlock < shmem_size_total) { + if (internal_space_instance->m_deviceProp.sharedMemPerBlock < + shmem_size_total) { Kokkos::Impl::throw_runtime_exception( std::string("Kokkos::Impl::ParallelReduce< HIP > requested too much " "L0 scratch memory")); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp index 83f829fdda..0b67921809 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.cpp @@ -23,7 +23,7 @@ #include #include -#ifndef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( Kokkos::HIPSpace); #else diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp index 1ca7bd5cd0..a464609108 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_SharedAllocationRecord.hpp @@ -20,7 +20,7 @@ #include #include -#if defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::HIPSpace); #else KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp index 4035bb0121..feee44ccaf 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Shuffle_Reduce.hpp @@ -100,7 +100,7 @@ template __device__ inline bool hip_inter_block_shuffle_reduction( typename FunctorType::reference_type value, typename FunctorType::reference_type neutral, FunctorType const& reducer, - HIP::size_type* const m_scratch_space, + typename FunctorType::pointer_type const m_scratch_space, typename FunctorType::pointer_type const /*result*/, HIP::size_type* const m_scratch_flags, int const max_active_thread = blockDim.y) { @@ -115,9 +115,8 @@ __device__ inline bool hip_inter_block_shuffle_reduction( // One thread in the block writes block result to global scratch_memory if (id == 0) { - pointer_type global = - reinterpret_cast(m_scratch_space) + blockIdx.x; - *global = value; + pointer_type global = m_scratch_space + blockIdx.x; + *global = value; __threadfence(); } @@ -140,8 +139,7 @@ __device__ inline bool hip_inter_block_shuffle_reduction( last_block = true; value = neutral; - pointer_type const global = - reinterpret_cast(m_scratch_space); + pointer_type const global = m_scratch_space; // Reduce all global values with splitting work over threads in one warp const int step_size = blockDim.x * blockDim.y < warp_size diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp index 67635fc1c4..47f07b31ab 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.cpp @@ -51,28 +51,54 @@ static std::atomic is_first_hip_managed_allocation(true); namespace Kokkos { -HIPSpace::HIPSpace() : m_device(HIP().hip_device()) {} +HIPSpace::HIPSpace() + : m_device(HIP().hip_device()), m_stream(HIP().hip_stream()) {} HIPHostPinnedSpace::HIPHostPinnedSpace() {} HIPManagedSpace::HIPManagedSpace() : m_device(HIP().hip_device()) {} +#ifndef KOKKOS_IMPL_HIP_UNIFIED_MEMORY +void* HIPSpace::allocate(const HIP& exec_space, + const size_t arg_alloc_size) const { + return allocate(exec_space, "[unlabeled]", arg_alloc_size); +} + +void* HIPSpace::allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(exec_space.hip_stream(), arg_label, arg_alloc_size, + arg_logical_size, true); +} +#endif + void* HIPSpace::allocate(const size_t arg_alloc_size) const { return allocate("[unlabeled]", arg_alloc_size); } -void* HIPSpace::allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size) const { - return impl_allocate(arg_label, arg_alloc_size, arg_logical_size); +void* HIPSpace::allocate(const char* arg_label, const size_t arg_alloc_size, + const size_t arg_logical_size) const { + return impl_allocate(m_stream, arg_label, arg_alloc_size, arg_logical_size, + false); } + void* HIPSpace::impl_allocate( - const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size, - const Kokkos::Tools::SpaceHandle arg_handle) const { + [[maybe_unused]] const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, const size_t arg_logical_size, + [[maybe_unused]] const bool stream_sync_only) const { void* ptr = nullptr; +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + auto const error_code = hipMallocAsync(&ptr, arg_alloc_size, stream); + if (stream_sync_only) { + KOKKOS_IMPL_HIP_SAFE_CALL(hipStreamSynchronize(stream)); + } else { + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); + } +#else auto const error_code = hipMalloc(&ptr, arg_alloc_size); +#endif + if (error_code != hipSuccess) { // This is the only way to clear the last error, which we should do here // since we're turning it into an exception here @@ -80,6 +106,8 @@ void* HIPSpace::impl_allocate( Kokkos::Impl::throw_bad_alloc(name(), arg_alloc_size, arg_label); } if (Kokkos::Profiling::profileLibraryLoaded()) { + const Kokkos::Tools::SpaceHandle arg_handle = + Kokkos::Tools::make_space_handle(name()); const size_t reported_size = (arg_logical_size > 0) ? arg_logical_size : arg_alloc_size; Kokkos::Profiling::allocateData(arg_handle, arg_label, ptr, reported_size); @@ -219,7 +247,12 @@ void HIPSpace::impl_deallocate( Kokkos::Profiling::deallocateData(arg_handle, arg_label, arg_alloc_ptr, reported_size); } +#ifdef KOKKOS_ENABLE_IMPL_HIP_MALLOC_ASYNC + KOKKOS_IMPL_HIP_SAFE_CALL(hipFreeAsync(arg_alloc_ptr, m_stream)); + KOKKOS_IMPL_HIP_SAFE_CALL(hipDeviceSynchronize()); +#else KOKKOS_IMPL_HIP_SAFE_CALL(hipFree(arg_alloc_ptr)); +#endif } void HIPHostPinnedSpace::deallocate(void* const arg_alloc_ptr, diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp index e1b4768b87..2380772cac 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Space.hpp @@ -58,14 +58,14 @@ class HIPSpace { /*--------------------------------*/ HIPSpace(); - HIPSpace(HIPSpace&& rhs) = default; - HIPSpace(const HIPSpace& rhs) = default; - HIPSpace& operator=(HIPSpace&& rhs) = default; + HIPSpace(HIPSpace&& rhs) = default; + HIPSpace(const HIPSpace& rhs) = default; + HIPSpace& operator=(HIPSpace&& rhs) = default; HIPSpace& operator=(const HIPSpace& rhs) = default; ~HIPSpace() = default; /**\brief Allocate untracked memory in the hip space */ -#ifdef KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY +#ifdef KOKKOS_IMPL_HIP_UNIFIED_MEMORY template void* allocate(const ExecutionSpace&, const size_t arg_alloc_size) const { return allocate(arg_alloc_size); @@ -77,15 +77,10 @@ class HIPSpace { return allocate(arg_label, arg_alloc_size, arg_logical_size); } #else - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const size_t arg_alloc_size) const { - return allocate(arg_alloc_size); - } - // FIXME_HIP Use execution space instance - void* allocate(const HIP&, const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0) const { - return allocate(arg_label, arg_alloc_size, arg_logical_size); - } + void* allocate(const HIP& exec_space, const size_t arg_alloc_size) const; + void* allocate(const HIP& exec_space, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size = 0) const; #endif void* allocate(const size_t arg_alloc_size) const; void* allocate(const char* arg_label, const size_t arg_alloc_size, @@ -98,10 +93,10 @@ class HIPSpace { const size_t arg_logical_size = 0) const; private: - void* impl_allocate(const char* arg_label, const size_t arg_alloc_size, - const size_t arg_logical_size = 0, - const Kokkos::Tools::SpaceHandle = - Kokkos::Tools::make_space_handle(name())) const; + void* impl_allocate(const hipStream_t stream, const char* arg_label, + const size_t arg_alloc_size, + const size_t arg_logical_size, + bool stream_sync_only) const; void impl_deallocate(const char* arg_label, void* const arg_alloc_ptr, const size_t arg_alloc_size, const size_t arg_logical_size = 0, @@ -114,6 +109,7 @@ class HIPSpace { private: int m_device; ///< Which HIP device + hipStream_t m_stream; }; template <> @@ -140,9 +136,9 @@ class HIPHostPinnedSpace { /*--------------------------------*/ HIPHostPinnedSpace(); - HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; - HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; - HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(HIPHostPinnedSpace&& rhs) = default; + HIPHostPinnedSpace(const HIPHostPinnedSpace& rhs) = default; + HIPHostPinnedSpace& operator=(HIPHostPinnedSpace&& rhs) = default; HIPHostPinnedSpace& operator=(const HIPHostPinnedSpace& rhs) = default; ~HIPHostPinnedSpace() = default; @@ -213,9 +209,9 @@ class HIPManagedSpace { /*--------------------------------*/ HIPManagedSpace(); - HIPManagedSpace(HIPManagedSpace&& rhs) = default; - HIPManagedSpace(const HIPManagedSpace& rhs) = default; - HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(HIPManagedSpace&& rhs) = default; + HIPManagedSpace(const HIPManagedSpace& rhs) = default; + HIPManagedSpace& operator=(HIPManagedSpace&& rhs) = default; HIPManagedSpace& operator=(const HIPManagedSpace& rhs) = default; ~HIPManagedSpace() = default; @@ -280,7 +276,7 @@ static_assert(Kokkos::Impl::MemorySpaceAccess::assignable); template <> struct MemorySpaceAccess { enum : bool { assignable = false }; -#if !defined(KOKKOS_ENABLE_IMPL_HIP_UNIFIED_MEMORY) +#if !defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) enum : bool{accessible = false}; #else enum : bool { accessible = true }; diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp index fb466d8a72..1724b4361d 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Team.hpp @@ -183,7 +183,7 @@ class HIPTeamMember { typename Kokkos::Impl::FunctorAnalysis< FunctorPatternInterface::REDUCE, TeamPolicy, ReducerType, typename ReducerType::value_type>::Reducer wrapped_reducer(reducer); - hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); + impl_team_reduce(wrapped_reducer, value); reducer.reference() = value; #else (void)reducer; @@ -191,6 +191,19 @@ class HIPTeamMember { #endif } + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { +#ifdef __HIP_DEVICE_COMPILE__ + hip_intra_block_shuffle_reduction(value, wrapped_reducer, blockDim.y); +#else + (void)wrapped_reducer; + (void)value; +#endif + } + //-------------------------------------------------------------------------- /** \brief Intra-team exclusive prefix sum with team_rank() ordering * with intra-team non-deterministic ordering accumulation. @@ -261,17 +274,37 @@ class HIPTeamMember { KOKKOS_INLINE_FUNCTION static std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) { +#ifdef __HIP_DEVICE_COMPILE__ + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; +#else + (void)reducer; + (void)value; +#endif + } + + template + KOKKOS_INLINE_FUNCTION static std::enable_if_t< + is_reducer::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) { #ifdef __HIP_DEVICE_COMPILE__ if (blockDim.x == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = blockDim.x; (i >>= 1);) { in_place_shfl_down(tmp2, tmp, i, blockDim.x); if (static_cast(threadIdx.x) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -281,10 +314,9 @@ class HIPTeamMember { // and thus different threads could have different results. in_place_shfl(tmp2, tmp, 0, blockDim.x); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; #else - (void)reducer; + (void)wrapped_reducer; (void)value; #endif } @@ -479,15 +511,26 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -508,24 +551,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y; i < loop_boundaries.end; - i += blockDim.y) { - closure(i, val); - } + for (iType i = loop_boundaries.start + threadIdx.y; + i < loop_boundaries.end; i += blockDim.y) { closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); result = value;)) + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } /** \brief Inter-thread parallel exclusive prefix sum. @@ -620,16 +663,26 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; i < loop_boundaries.end; i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -642,25 +695,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - ValueType val; - Kokkos::Sum reducer(val); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - for (iType i = loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; - i < loop_boundaries.end; i += blockDim.y * blockDim.x) { - closure(i, val); - } + for (iType i = + loop_boundaries.start + threadIdx.y * blockDim.x + threadIdx.x; + i < loop_boundaries.end; + i += blockDim.y * blockDim.x) { closure(i, value); } - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- @@ -706,14 +761,26 @@ parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { #ifdef __HIP_DEVICE_COMPILE__ - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; i += blockDim.x) { - closure(i, reducer.reference()); + closure(i, value); } - Impl::HIPTeamMember::vector_reduce(reducer); + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; #else (void)loop_boundaries; (void)closure; @@ -737,20 +804,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::HIPTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { -#ifdef __HIP_DEVICE_COMPILE__ - result = ValueType(); + KOKKOS_IF_ON_DEVICE( + (using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - for (iType i = loop_boundaries.start + threadIdx.x; i < loop_boundaries.end; - i += blockDim.x) { - closure(i, result); - } + wrapped_reducer_type wrapped_reducer(closure); value_type value; + wrapped_reducer.init(&value); - Impl::HIPTeamMember::vector_reduce(Kokkos::Sum(result)); -#else - (void)loop_boundaries; - (void)closure; - (void)result; -#endif + for (iType i = loop_boundaries.start + threadIdx.x; + i < loop_boundaries.end; i += blockDim.x) { closure(i, value); } + + Impl::HIPTeamMember::impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); result = value;)) + + KOKKOS_IF_ON_HOST(((void)loop_boundaries; (void)closure; (void)result;)) } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp index 67e1181125..f21c65f16d 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_TeamPolicyInternal.hpp @@ -222,7 +222,8 @@ class TeamPolicyInternal m_tune_team_size(bool(team_size_request <= 0)), m_tune_vector_length(bool(vector_length_request <= 0)) { // Make sure league size is permissible - if (league_size_ >= static_cast(hip_internal_maximum_grid_count()[0])) + const int max_grid_size_x = m_space.hip_device_prop().maxGridSize[0]; + if (league_size_ >= max_grid_size_x) Impl::throw_runtime_exception( "Requested too large league_size for TeamPolicy on HIP execution " "space."); diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp index 30774c898b..f5b1d321e8 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_Vectorization.hpp @@ -40,8 +40,8 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { using shfl_type = int; union conv_type { Scalar orig; @@ -65,16 +65,16 @@ struct in_place_shfl_op { template // requires _assignable_from_bits __device__ inline std::enable_if_t operator()( - Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( reinterpret_cast(in), lane_or_delta, width); } template __device__ inline std::enable_if_t - operator()(Scalar& out, Scalar const& in, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, Scalar const& in, int lane_or_delta, + int width) const noexcept { reinterpret_cast(out) = self().do_shfl_op( *reinterpret_cast(&in), lane_or_delta, width); } @@ -82,8 +82,8 @@ struct in_place_shfl_op { // sizeof(Scalar) > sizeof(double) case template __device__ inline std::enable_if_t<(sizeof(Scalar) > sizeof(double))> - operator()(Scalar& out, const Scalar& val, int lane_or_delta, int width) const - noexcept { + operator()(Scalar& out, const Scalar& val, int lane_or_delta, + int width) const noexcept { using shuffle_as_t = int; constexpr int N = sizeof(Scalar) / sizeof(shuffle_as_t); @@ -108,7 +108,7 @@ struct in_place_shfl_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl(Args&&... args) noexcept { - in_place_shfl_fn{}((Args &&) args...); + in_place_shfl_fn{}((Args&&)args...); } struct in_place_shfl_up_fn : in_place_shfl_op { @@ -123,7 +123,7 @@ struct in_place_shfl_up_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_up( Args&&... args) noexcept { - in_place_shfl_up_fn{}((Args &&) args...); + in_place_shfl_up_fn{}((Args&&)args...); } struct in_place_shfl_down_fn : in_place_shfl_op { @@ -138,7 +138,7 @@ struct in_place_shfl_down_fn : in_place_shfl_op { template __device__ KOKKOS_IMPL_FORCEINLINE void in_place_shfl_down( Args&&... args) noexcept { - in_place_shfl_down_fn{}((Args &&) args...); + in_place_shfl_down_fn{}((Args&&)args...); } } // namespace Impl diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp new file mode 100644 index 0000000000..34d5ecf1a6 --- /dev/null +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.cpp @@ -0,0 +1,36 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#define KOKKOS_IMPL_PUBLIC_INCLUDE +#endif + +#include +#include + +namespace Kokkos { +namespace Impl { + +// alternative to hipMemsetAsync, which sets the first `cnt` bytes of `dst` to 0 +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt) { + Kokkos::parallel_for( + "Kokkos::ZeroMemset via parallel_for", + Kokkos::RangePolicy(exec_space, 0, cnt), + KOKKOS_LAMBDA(size_t i) { static_cast(dst)[i] = 0; }); +} + +} // namespace Impl +} // namespace Kokkos diff --git a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp index 4bca29868f..18708cf8c5 100644 --- a/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp +++ b/lib/kokkos/core/src/HIP/Kokkos_HIP_ZeroMemset.hpp @@ -23,12 +23,21 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const HIP& exec_space, const View& dst) { - KOKKOS_IMPL_HIP_SAFE_CALL(hipMemsetAsync( - dst.data(), 0, dst.size() * sizeof(typename View::value_type), - exec_space.hip_stream())); +// hipMemsetAsync sets the first `cnt` bytes of `dst` to the provided value +void zero_with_hip_kernel(const HIP& exec_space, void* dst, size_t cnt); + +template <> +struct ZeroMemset { + ZeroMemset(const HIP& exec_space, void* dst, size_t cnt) { + // in ROCm <= 6.2.0, hipMemsetAsync on a host-allocated pointer + // returns an invalid value error, but accessing the data via a + // GPU kernel works. +#if defined(KOKKOS_IMPL_HIP_UNIFIED_MEMORY) + zero_with_hip_kernel(exec_space, dst, cnt); +#else + KOKKOS_IMPL_HIP_SAFE_CALL( + hipMemsetAsync(dst, 0, cnt, exec_space.hip_stream())); +#endif } }; diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp index 245dc128ca..7d49933790 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX.hpp @@ -32,12 +32,10 @@ static_assert(false, #include #include #include -#include #include #include #include #include -#include #include #include @@ -75,12 +73,12 @@ class hpx_thread_buffer { } public: - hpx_thread_buffer() = default; - ~hpx_thread_buffer() = default; - hpx_thread_buffer(const hpx_thread_buffer &) = delete; - hpx_thread_buffer(hpx_thread_buffer &&) = delete; + hpx_thread_buffer() = default; + ~hpx_thread_buffer() = default; + hpx_thread_buffer(const hpx_thread_buffer &) = delete; + hpx_thread_buffer(hpx_thread_buffer &&) = delete; hpx_thread_buffer &operator=(const hpx_thread_buffer &) = delete; - hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; + hpx_thread_buffer &operator=(hpx_thread_buffer) = delete; void resize(const std::size_t num_threads, const std::size_t size_per_thread, const std::size_t extra_space = 0) noexcept; @@ -140,10 +138,10 @@ class HPX { hpx::execution::experimental::unique_any_sender<> &&sender) : m_instance_id(instance_id), m_sender{std::move(sender)} {} - instance_data(const instance_data &) = delete; - instance_data(instance_data &&) = delete; + instance_data(const instance_data &) = delete; + instance_data(instance_data &&) = delete; instance_data &operator=(const instance_data &) = delete; - instance_data &operator=(instance_data) = delete; + instance_data &operator=(instance_data) = delete; uint32_t m_instance_id{HPX::impl_default_instance_id()}; hpx::execution::experimental::unique_any_sender<> m_sender{ @@ -196,7 +194,7 @@ class HPX { HPX(HPX &&other) = default; HPX(const HPX &other) = default; - HPX &operator=(HPX &&) = default; + HPX &operator=(HPX &&) = default; HPX &operator=(const HPX &) = default; void print_configuration(std::ostream &os, bool /*verbose*/ = false) const; @@ -214,9 +212,9 @@ class HPX { struct impl_in_parallel_scope { impl_in_parallel_scope() noexcept; ~impl_in_parallel_scope() noexcept; - impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; - impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; - impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope &&) = delete; + impl_in_parallel_scope(impl_in_parallel_scope const &) = delete; + impl_in_parallel_scope &operator=(impl_in_parallel_scope &&) = delete; impl_in_parallel_scope &operator=(impl_in_parallel_scope const &) = delete; }; @@ -249,13 +247,15 @@ class HPX { impl_instance_fence(name); } - static bool is_asynchronous(HPX const & = HPX()) noexcept { +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + KOKKOS_DEPRECATED static bool is_asynchronous(HPX const & = HPX()) noexcept { #if defined(KOKKOS_ENABLE_IMPL_HPX_ASYNC_DISPATCH) return true; #else return false; #endif } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(); @@ -281,8 +281,8 @@ class HPX { return impl_get_instance_data().m_buffer; } - hpx::execution::experimental::unique_any_sender<> &impl_get_sender() const - noexcept { + hpx::execution::experimental::unique_any_sender<> &impl_get_sender() + const noexcept { return impl_get_instance_data().m_sender; } @@ -447,6 +447,20 @@ class HPX { } }; +template +std::vector partition_space(HPX const &, Args... args) { + std::vector instances(sizeof...(args)); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + +template +std::vector partition_space(HPX const &, std::vector const &weights) { + std::vector instances(weights.size()); + for (auto &in : instances) in = HPX(HPX::instance_mode::independent); + return instances; +} + extern template void HPX::impl_bulk_plain_erased( bool, bool, std::function &&, int const, hpx::threads::thread_stacksize stacksize) const; @@ -1772,11 +1786,24 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamThreadRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } /** \brief Intra-thread vector parallel_for. Executes lambda(iType i) for each @@ -1810,14 +1837,26 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::ThreadVectorRangeBoundariesStruct &loop_boundaries, const Lambda &lambda, ValueType &result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + wrapped_reducer.final(&value); + result = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template &loop_boundaries, const Lambda &lambda, const ReducerType &reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + #ifdef KOKKOS_ENABLE_PRAGMA_IVDEP #pragma ivdep #endif for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -1995,7 +2060,9 @@ KOKKOS_INLINE_FUNCTION void single( } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #endif /* #if defined( KOKKOS_ENABLE_HPX ) */ #endif /* #ifndef KOKKOS_HPX_HPP */ diff --git a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp index 28c75b2515..d775b7fac3 100644 --- a/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp +++ b/lib/kokkos/core/src/HPX/Kokkos_HPX_Task.hpp @@ -25,6 +25,8 @@ #include +#include + #include #include @@ -33,6 +35,11 @@ //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -256,6 +263,10 @@ extern template class TaskQueue< } // namespace Impl } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp index 297b1fadee..92dc506c5e 100644 --- a/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/KokkosExp_MDRangePolicy.hpp @@ -30,6 +30,7 @@ static_assert(false, #include #include #include +#include namespace Kokkos { @@ -60,13 +61,13 @@ namespace Impl { // NOTE the comparison below is encapsulated to silent warnings about pointless // comparison of unsigned integer with zero template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T) { return false; } template -constexpr std::enable_if_t::value, bool> +constexpr std::enable_if_t, bool> is_less_than_value_initialized_variable(T arg) { return arg < T{}; } @@ -75,7 +76,7 @@ is_less_than_value_initialized_variable(T arg) { template constexpr To checked_narrow_cast(From arg, std::size_t idx) { constexpr const bool is_different_signedness = - (std::is_signed::value != std::is_signed::value); + (std::is_signed_v != std::is_signed_v); auto const ret = static_cast(arg); if (static_cast(ret) != arg || (is_different_signedness && @@ -183,7 +184,7 @@ struct MDRangePolicy template friend struct MDRangePolicy; - static_assert(!std::is_void::value, + static_assert(!std::is_void_v, "Kokkos Error: MD iteration pattern not defined"); using iteration_pattern = typename traits::iteration_pattern; @@ -238,9 +239,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) : MDRangePolicy( @@ -257,9 +258,9 @@ struct MDRangePolicy template ::value && - std::is_integral::value && - std::is_integral::value>> + typename = std::enable_if_t && + std::is_integral_v && + std::is_integral_v>> MDRangePolicy(const typename traits::execution_space& work_space, const LT (&lower)[LN], const UT (&upper)[UN], const TT (&tile)[TN] = {}) @@ -291,14 +292,14 @@ struct MDRangePolicy } template ::value>> + typename = std::enable_if_t>> MDRangePolicy(Kokkos::Array const& lower, Kokkos::Array const& upper, Kokkos::Array const& tile = Kokkos::Array{}) : MDRangePolicy(typename traits::execution_space(), lower, upper, tile) {} template ::value>> + typename = std::enable_if_t>> MDRangePolicy(const typename traits::execution_space& work_space, Kokkos::Array const& lower, Kokkos::Array const& upper, @@ -330,7 +331,44 @@ struct MDRangePolicy } bool impl_tune_tile_size() const { return m_tune_tile_size; } + tile_type tile_size_recommended() const { + tile_type rec_tile_sizes = {}; + + for (std::size_t i = 0; i < rec_tile_sizes.size(); ++i) { + rec_tile_sizes[i] = tile_size_recommended(i); + } + return rec_tile_sizes; + } + + int max_total_tile_size() const { + return Impl::get_tile_size_properties(m_space).max_total_tile_size; + } + private: + int tile_size_recommended(const int tile_rank) const { + auto properties = Impl::get_tile_size_properties(m_space); + int last_rank = (inner_direction == Iterate::Right) ? rank - 1 : 0; + int rank_acc = + (inner_direction == Iterate::Right) ? tile_rank + 1 : tile_rank - 1; + int rec_tile_size = (std::pow(properties.default_tile_size, rank_acc) < + properties.max_total_tile_size) + ? properties.default_tile_size + : 1; + + if (tile_rank == last_rank) { + rec_tile_size = tile_size_last_rank( + properties, m_upper[last_rank] - m_lower[last_rank]); + } + return rec_tile_size; + } + + int tile_size_last_rank(const Impl::TileSizeProperties properties, + const index_type length) const { + return properties.default_largest_tile_size == 0 + ? std::max(length, 1) + : properties.default_largest_tile_size; + } + void init_helper(Impl::TileSizeProperties properties) { m_prod_tile_dims = 1; int increment = 1; @@ -341,6 +379,7 @@ struct MDRangePolicy rank_start = rank - 1; rank_end = -1; } + for (int i = rank_start; i != rank_end; i += increment) { const index_type length = m_upper[i] - m_lower[i]; @@ -368,9 +407,7 @@ struct MDRangePolicy m_tile[i] = 1; } } else { - m_tile[i] = properties.default_largest_tile_size == 0 - ? std::max(length, 1) - : properties.default_largest_tile_size; + m_tile[i] = tile_size_last_rank(properties, length); } } m_tile_end[i] = @@ -389,58 +426,55 @@ struct MDRangePolicy }; template -MDRangePolicy(const LT (&)[N], const UT (&)[N])->MDRangePolicy>; +MDRangePolicy(const LT (&)[N], const UT (&)[N]) -> MDRangePolicy>; template MDRangePolicy(const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, const LT (&)[N], const UT (&)[N], - const TT (&)[TN]) - ->MDRangePolicy>; + const TT (&)[TN]) -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N]) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, const LT (&)[N], const UT (&)[N], const TT (&)[TN]) - ->MDRangePolicy>; + -> MDRangePolicy>; template -MDRangePolicy(Array const&, Array const&)->MDRangePolicy>; +MDRangePolicy(Array const&, Array const&) -> MDRangePolicy>; template MDRangePolicy(Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; template MDRangePolicy(DefaultExecutionSpace const&, Array const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&) - ->MDRangePolicy>; + -> MDRangePolicy>; template >> MDRangePolicy(ES const&, Array const&, Array const&, - Array const&) - ->MDRangePolicy>; + Array const&) -> MDRangePolicy>; } // namespace Kokkos diff --git a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp index 9f5deed5d6..62f527aa02 100644 --- a/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_AnonymousSpace.hpp @@ -41,10 +41,10 @@ class AnonymousSpace { using device_type = Kokkos::Device; /**\brief Default memory space instance */ - AnonymousSpace() = default; - AnonymousSpace(AnonymousSpace &&rhs) = default; - AnonymousSpace(const AnonymousSpace &rhs) = default; - AnonymousSpace &operator=(AnonymousSpace &&) = default; + AnonymousSpace() = default; + AnonymousSpace(AnonymousSpace &&rhs) = default; + AnonymousSpace(const AnonymousSpace &rhs) = default; + AnonymousSpace &operator=(AnonymousSpace &&) = default; AnonymousSpace &operator=(const AnonymousSpace &) = default; ~AnonymousSpace() = default; diff --git a/lib/kokkos/core/src/Kokkos_Array.hpp b/lib/kokkos/core/src/Kokkos_Array.hpp index 4d905fbc55..493536b53b 100644 --- a/lib/kokkos/core/src/Kokkos_Array.hpp +++ b/lib/kokkos/core/src/Kokkos_Array.hpp @@ -35,7 +35,7 @@ namespace Kokkos { #ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK namespace Impl { -template ::value> +template > struct ArrayBoundsCheck; template @@ -195,8 +195,10 @@ struct Array { return *reinterpret_cast(-1); } - KOKKOS_INLINE_FUNCTION pointer data() { return nullptr; } - KOKKOS_INLINE_FUNCTION const_pointer data() const { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr pointer data() { return nullptr; } + KOKKOS_INLINE_FUNCTION constexpr const_pointer data() const { + return nullptr; + } friend KOKKOS_FUNCTION constexpr bool operator==(Array const&, Array const&) noexcept { @@ -365,7 +367,7 @@ struct KOKKOS_DEPRECATED #endif template -Array(T, Us...)->Array; +Array(T, Us...) -> Array; namespace Impl { @@ -377,7 +379,7 @@ KOKKOS_FUNCTION constexpr Array, N> to_array_impl( template KOKKOS_FUNCTION constexpr Array, N> to_array_impl( - T(&&a)[N], std::index_sequence) { + T (&&a)[N], std::index_sequence) { return {{std::move(a[I])...}}; } @@ -389,7 +391,7 @@ KOKKOS_FUNCTION constexpr auto to_array(T (&a)[N]) { } template -KOKKOS_FUNCTION constexpr auto to_array(T(&&a)[N]) { +KOKKOS_FUNCTION constexpr auto to_array(T (&&a)[N]) { return Impl::to_array_impl(std::move(a), std::make_index_sequence{}); } @@ -435,6 +437,32 @@ KOKKOS_FUNCTION constexpr T const&& get(Array const&& a) noexcept { } // namespace Kokkos // +// +namespace Kokkos { + +template +KOKKOS_FUNCTION constexpr T const* begin(Array const& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T* begin(Array& a) noexcept { + return a.data(); +} + +template +KOKKOS_FUNCTION constexpr T const* end(Array const& a) noexcept { + return a.data() + a.size(); +} + +template +KOKKOS_FUNCTION constexpr T* end(Array& a) noexcept { + return a.data() + a.size(); +} + +} // namespace Kokkos +// + #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY #undef KOKKOS_IMPL_PUBLIC_INCLUDE #undef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ARRAY diff --git a/lib/kokkos/core/src/Kokkos_Atomic.hpp b/lib/kokkos/core/src/Kokkos_Atomic.hpp index 6fc903f274..ba61136092 100644 --- a/lib/kokkos/core/src/Kokkos_Atomic.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomic.hpp @@ -47,7 +47,6 @@ #include #include -#include #ifdef KOKKOS_IMPL_PUBLIC_INCLUDE_NOTDEFINED_ATOMIC #undef KOKKOS_IMPL_PUBLIC_INCLUDE diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp deleted file mode 100644 index bf57dcae65..0000000000 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Volatile_Wrapper.hpp +++ /dev/null @@ -1,196 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#include -static_assert(false, - "Including non-public Kokkos header files is not allowed."); -#endif -#ifndef KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#define KOKKOS_DESUL_ATOMICS_VOLATILE_WRAPPER_HPP_ -#include -#include - -#ifdef KOKKOS_ENABLE_ATOMICS_BYPASS -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeCaller() -#else -#define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() -#endif - -// clang-format off -namespace Kokkos { - -template KOKKOS_INLINE_FUNCTION -T atomic_load(volatile T* const dest) { return desul::atomic_load(const_cast(dest), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(volatile T* const dest) { return desul::atomic_fetch_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(volatile T* const dest) { return desul::atomic_fetch_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(volatile T* const dest) { return desul::atomic_inc_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(volatile T* const dest) { return desul::atomic_dec_fetch(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or (volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(volatile T* const dest) { return desul::atomic_inc(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(volatile T* const dest) { return desul::atomic_dec(const_cast(dest),desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(volatile T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(const_cast(dest), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(volatile T* const dest, T& expected, const T desired) { - return desul::atomic_compare_exchange_strong(const_cast(dest),expected, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(volatile T* const dest, const T compare, const T desired) { - return desul::atomic_compare_exchange(const_cast(dest),compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -} -#undef KOKKOS_DESUL_MEM_SCOPE - -// clang-format on -#endif diff --git a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp index 26db69ac1f..40f51c5a33 100644 --- a/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp +++ b/lib/kokkos/core/src/Kokkos_Atomics_Desul_Wrapper.hpp @@ -24,14 +24,16 @@ static_assert(false, #include #include +#include // identity_type #include -// clang-format off namespace Kokkos { -// FIXME: These functions don't have any use/test in unit tests ... -// ========================================================== -inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +KOKKOS_DEPRECATED inline const char* atomic_query_version() { + return "KOKKOS_DESUL_ATOMICS"; +} +#endif #if defined(KOKKOS_COMPILER_GNU) && !defined(__PGIC__) && \ !defined(__CUDA_ARCH__) @@ -53,197 +55,120 @@ inline const char* atomic_query_version() { return "KOKKOS_DESUL_ATOMICS"; } #define KOKKOS_DESUL_MEM_SCOPE desul::MemoryScopeDevice() #endif -template KOKKOS_INLINE_FUNCTION -T atomic_load(T* const dest) { return desul::atomic_load(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_store(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_store(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_assign(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { atomic_store(dest,val); } - -KOKKOS_INLINE_FUNCTION -void memory_fence() { - desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); -} - -KOKKOS_INLINE_FUNCTION -void load_fence() { return desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } - -KOKKOS_INLINE_FUNCTION -void store_fence() { return desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } - -// atomic_fetch_op -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_add (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_sub (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_max (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_min (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mul (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_div (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_mod (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_mod (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_and (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_or (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_xor (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_xor (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_nand(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_nand(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_lshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_lshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_rshift(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_fetch_rshift(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_inc(T* const dest) { return desul::atomic_fetch_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_fetch_dec(T* const dest) { return desul::atomic_fetch_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op_fetch -template KOKKOS_INLINE_FUNCTION -T atomic_add_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_sub_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_max_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_min_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mul_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_div_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_mod_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mod_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_and_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_and_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_or_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_or_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_xor_fetch (T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_xor_fetch (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_nand_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_nand_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_lshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_lshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_rshift_fetch(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_rshift_fetch(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_inc_fetch(T* const dest) { return desul::atomic_inc_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -T atomic_dec_fetch(T* const dest) { return desul::atomic_dec_fetch(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - - -// atomic_op -template KOKKOS_INLINE_FUNCTION -void atomic_add(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_add (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_sub(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_sub (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_mul(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_mul (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_div(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_div (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_min(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_min (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_max(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_max (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_and yet so call fetch_and -template KOKKOS_INLINE_FUNCTION -void atomic_and(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_and (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// FIXME: Desul doesn't have atomic_or yet so call fetch_or -template KOKKOS_INLINE_FUNCTION -void atomic_or(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { (void) desul::atomic_fetch_or (dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_inc(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_dec(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_increment(T* const dest) { return desul::atomic_inc(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -void atomic_decrement(T* const dest) { return desul::atomic_dec(dest, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -// Exchange - -template KOKKOS_INLINE_FUNCTION -T atomic_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t val) { return desul::atomic_exchange(dest, val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } - -template KOKKOS_INLINE_FUNCTION -bool atomic_compare_exchange_strong(T* const dest, desul::Impl::dont_deduce_this_parameter_t expected, desul::Impl::dont_deduce_this_parameter_t desired) { - T expected_ref = expected; - return desul::atomic_compare_exchange_strong(dest, expected_ref, desired, - desul::MemoryOrderRelaxed(), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - -template KOKKOS_INLINE_FUNCTION -T atomic_compare_exchange(T* const dest, desul::Impl::dont_deduce_this_parameter_t compare, desul::Impl::dont_deduce_this_parameter_t desired) { - return desul::atomic_compare_exchange(dest, compare, desired, - desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); -} - namespace Impl { - template KOKKOS_INLINE_FUNCTION - bool atomic_compare_exchange_strong(T* const dest, T& expected, const T desired, MemOrderSuccess succ, MemOrderFailure fail) { - return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, fail, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - T atomic_load(const T* const src, MemoryOrder order) { - return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); - } - template - KOKKOS_INLINE_FUNCTION - void atomic_store(T* const src, const T val, MemoryOrder order) { - return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); - } +template +using not_deduced_atomic_t = + std::add_const_t>>; + +template +using enable_if_atomic_t = + std::enable_if_t && !std::is_const_v, + std::remove_volatile_t>; } // namespace Impl +// clang-format off + +// fences +KOKKOS_INLINE_FUNCTION void memory_fence() { desul::atomic_thread_fence(desul::MemoryOrderSeqCst(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void load_fence() { desul::atomic_thread_fence(desul::MemoryOrderAcquire(), KOKKOS_DESUL_MEM_SCOPE); } +KOKKOS_INLINE_FUNCTION void store_fence() { desul::atomic_thread_fence(desul::MemoryOrderRelease(), KOKKOS_DESUL_MEM_SCOPE); } + +// load/store +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_load (T const* ptr) { return desul::atomic_load (const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_store(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_store(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_store() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_assign(T* ptr, Impl::not_deduced_atomic_t val) { atomic_store(ptr, val); } +#endif + +// atomic_fetch_op +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_add(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_sub(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_max(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_min(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mul(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_div(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_mod(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_and(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_or (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_xor(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_nand(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_nand(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_lshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_rshift(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_inc(T* ptr) { return desul::atomic_fetch_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_fetch_dec(T* ptr) { return desul::atomic_fetch_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } + +// atomic_op_fetch +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_add_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_sub_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_max_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_min_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mul_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_div_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_mod_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_and_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or_fetch (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_or_fetch (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_xor_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_lshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift_fetch(T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_rshift_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc_fetch(T* ptr) { return desul::atomic_inc_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec_fetch(T* ptr) { return desul::atomic_dec_fetch(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } + +// atomic_op +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_add(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_add(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_sub(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_sub(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_max(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_max(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_min(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_min(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mul(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_mul(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_div(T* ptr, Impl::not_deduced_atomic_t val) { desul::atomic_div(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_mod(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_mod(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_and(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_and(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_or (T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_or (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_xor(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_xor(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_nand(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_nand_fetch(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_lshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_lshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_rshift(T* ptr, Impl::not_deduced_atomic_t val) { (void)desul::atomic_fetch_rshift(const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_inc(T* ptr) { desul::atomic_inc(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_dec(T* ptr) { desul::atomic_dec(const_cast*>(ptr), desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_inc() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_increment(T* ptr) { atomic_inc(ptr); } +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_dec() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_decrement(T* ptr) { atomic_dec(ptr); } +#endif + +// exchange +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_exchange (T* ptr, Impl::not_deduced_atomic_t val) { return desul::atomic_exchange (const_cast*>(ptr), val, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +template KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return desul::atomic_compare_exchange(const_cast*>(ptr), expected, desired, desul::MemoryOrderRelaxed(), KOKKOS_DESUL_MEM_SCOPE); } +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template KOKKOS_DEPRECATED_WITH_COMMENT("Use atomic_compare_exchange() instead!") KOKKOS_FUNCTION Impl::enable_if_atomic_t atomic_compare_exchange_strong(T* ptr, Impl::not_deduced_atomic_t expected, Impl::not_deduced_atomic_t desired) { return expected == atomic_compare_exchange(ptr, expected, desired); } +#endif + +// clang-format on } // namespace Kokkos +namespace Kokkos::Impl { + +template +KOKKOS_FUNCTION bool atomic_compare_exchange_strong(T* const dest, T& expected, + const T desired, + MemOrderSuccess succ, + MemOrderFailure fail) { + return desul::atomic_compare_exchange_strong(dest, expected, desired, succ, + fail, KOKKOS_DESUL_MEM_SCOPE); +} + +template +KOKKOS_FUNCTION T atomic_load(const T* const src, MemoryOrder order) { + return desul::atomic_load(src, order, KOKKOS_DESUL_MEM_SCOPE); +} + +template +KOKKOS_FUNCTION void atomic_store(T* const src, const T val, + MemoryOrder order) { + return desul::atomic_store(src, val, order, KOKKOS_DESUL_MEM_SCOPE); +} + +} // namespace Kokkos::Impl + #undef KOKKOS_DESUL_MEM_SCOPE -// clang-format on #endif diff --git a/lib/kokkos/core/src/Kokkos_Complex.hpp b/lib/kokkos/core/src/Kokkos_Complex.hpp index 7dd2a9ddbb..8233c30b24 100644 --- a/lib/kokkos/core/src/Kokkos_Complex.hpp +++ b/lib/kokkos/core/src/Kokkos_Complex.hpp @@ -70,9 +70,8 @@ class complex& operator=(const complex&) noexcept = default; /// \brief Conversion constructor from compatible RType - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_INLINE_FUNCTION complex(const complex& other) noexcept // Intentionally do the conversions implicitly here so that users don't // get any warnings about narrowing, etc., that they would expect to get @@ -265,9 +264,8 @@ class #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 //! Copy constructor from volatile. - template < - class RType, - std::enable_if_t::value, int> = 0> + template , int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex(const volatile complex& src) noexcept // Intentionally do the conversions implicitly here so that users don't @@ -296,7 +294,7 @@ class // vl = r; // vl = cr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION void operator=( const Complex& src) volatile noexcept { re_ = src.re_; @@ -319,7 +317,7 @@ class // vl = vr; // vl = cvr; template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION volatile complex& operator=( const volatile Complex& src) volatile noexcept { re_ = src.re_; @@ -341,7 +339,7 @@ class // l = cvr; // template ::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION complex& operator=( const volatile Complex& src) noexcept { re_ = src.re_; @@ -539,7 +537,7 @@ inline bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -551,7 +549,7 @@ KOKKOS_INLINE_FUNCTION bool operator==(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator==(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -590,7 +588,7 @@ inline bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, RealType2 const& y) noexcept { using common_type = std::common_type_t; @@ -602,7 +600,7 @@ KOKKOS_INLINE_FUNCTION bool operator!=(complex const& x, template < class RealType1, class RealType2, // Constraints to avoid participation in oparator==() for every possible RHS - std::enable_if_t::value, int> = 0> + std::enable_if_t, int> = 0> KOKKOS_INLINE_FUNCTION bool operator!=(RealType1 const& x, complex const& y) noexcept { using common_type = std::common_type_t; @@ -778,16 +776,14 @@ KOKKOS_INLINE_FUNCTION complex pow(const complex& x, return x == T() ? T() : exp(y * log(x)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow( const T& x, const complex& y) { using type = Impl::promote_2_t; return pow(type(x), complex(y)); } -template ::value>> +template >> KOKKOS_INLINE_FUNCTION complex> pow(const complex& x, const U& y) { using type = Impl::promote_2_t; diff --git a/lib/kokkos/core/src/Kokkos_Concepts.hpp b/lib/kokkos/core/src/Kokkos_Concepts.hpp index df78a644a0..0bfb9eb5fa 100644 --- a/lib/kokkos/core/src/Kokkos_Concepts.hpp +++ b/lib/kokkos/core/src/Kokkos_Concepts.hpp @@ -41,8 +41,7 @@ struct Dynamic {}; // Schedule Wrapper Type template struct Schedule { - static_assert(std::is_same::value || - std::is_same::value, + static_assert(std::is_same_v || std::is_same_v, "Kokkos: Invalid Schedule<> type."); using schedule_type = Schedule; using type = T; @@ -51,7 +50,7 @@ struct Schedule { // Specify Iteration Index Type template struct IndexType { - static_assert(std::is_integral::value, "Kokkos: Invalid IndexType<>."); + static_assert(std::is_integral_v, "Kokkos: Invalid IndexType<>."); using index_type = IndexType; using type = T; }; @@ -139,8 +138,8 @@ namespace Kokkos { \ public: \ static constexpr bool value = \ - std::is_base_of, T>::value || \ - std::is_base_of, T>::value; \ + std::is_base_of_v, T> || \ + std::is_base_of_v, T>; \ constexpr operator bool() const noexcept { return value; } \ }; \ template \ @@ -292,44 +291,6 @@ struct is_space { using execution_space = typename is_exe::space; using memory_space = typename is_mem::space; - - // For backward compatibility, deprecated in favor of - // Kokkos::Impl::HostMirror::host_mirror_space - - private: - // The actual definitions for host_memory_space and host_execution_spaces are - // in do_not_use_host_memory_space and do_not_use_host_execution_space to be - // able to use them within this class without deprecation warnings. - using do_not_use_host_memory_space = std::conditional_t< - std::is_same::value -#if defined(KOKKOS_ENABLE_CUDA) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_HIP) - || std::is_same::value || - std::is_same::value -#elif defined(KOKKOS_ENABLE_SYCL) - || std::is_same::value || - std::is_same::value -#endif - , - memory_space, Kokkos::HostSpace>; - - using do_not_use_host_execution_space = std::conditional_t< -#if defined(KOKKOS_ENABLE_CUDA) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_HIP) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_SYCL) - std::is_same::value || -#elif defined(KOKKOS_ENABLE_OPENMPTARGET) - std::is_same::value || -#endif - false, - Kokkos::DefaultHostExecutionSpace, execution_space>; }; } // namespace Kokkos @@ -357,7 +318,7 @@ struct MemorySpaceAccess { * 2. All execution spaces that can access DstMemorySpace can also access * SrcMemorySpace. */ - enum { assignable = std::is_same::value }; + enum { assignable = std::is_same_v }; /**\brief For all DstExecSpace::memory_space == DstMemorySpace * DstExecSpace can access SrcMemorySpace. @@ -442,7 +403,7 @@ struct SpaceAccessibility { // If same memory space or not accessible use the AccessSpace // else construct a device with execution space and memory space. using space = std::conditional_t< - std::is_same::value || + std::is_same_v || !exe_access::accessible, AccessSpace, Kokkos::Device>; diff --git a/lib/kokkos/core/src/Kokkos_CopyViews.hpp b/lib/kokkos/core/src/Kokkos_CopyViews.hpp index e856b19247..7da59aa4e4 100644 --- a/lib/kokkos/core/src/Kokkos_CopyViews.hpp +++ b/lib/kokkos/core/src/Kokkos_CopyViews.hpp @@ -561,21 +561,20 @@ void view_copy(const ExecutionSpace& space, const DstType& dst, int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -649,21 +648,20 @@ void view_copy(const DstType& dst, const SrcType& src) { int64_t strides[DstType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[DstType::rank - 1]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1350,22 +1348,20 @@ inline void contiguous_fill( } // Default implementation for execution spaces that don't provide a definition -template +template struct ZeroMemset { - ZeroMemset(const ExecutionSpace& exec_space, const ViewType& dst) { - using ValueType = typename ViewType::value_type; - alignas(alignof(ValueType)) unsigned char - zero_initialized_storage[sizeof(ValueType)] = {}; - contiguous_fill(exec_space, dst, - *reinterpret_cast(zero_initialized_storage)); + ZeroMemset(const ExecutionSpace& exec_space, void* dst, size_t cnt) { + contiguous_fill( + exec_space, + Kokkos::View( + static_cast(dst), cnt), + std::byte{}); } }; template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1375,20 +1371,20 @@ contiguous_fill_or_memset( && !std::is_same_v #endif ) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset>(exec_space, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. + ZeroMemset( + exec_space, dst.data(), + dst.size() * sizeof(typename ViewTraits::value_type)); else contiguous_fill(exec_space, dst, value); } template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const ExecutionSpace& exec_space, const View& dst, typename ViewTraits::const_value_type& value) { @@ -1397,9 +1393,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value> + std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1411,11 +1405,12 @@ contiguous_fill_or_memset( // leading to the significant performance issues #ifndef KOKKOS_ARCH_A64FX if (Impl::is_zero_byte(value)) - // FIXME intel/19 icpc fails to deduce template parameters here, + // FIXME intel/19 icpc fails to deduce template parameter here, // resulting in compilation errors; explicitly passing the template - // parameters to ZeroMemset helps workaround the issue - // See https://github.com/kokkos/kokkos/issues/6775 - ZeroMemset(exec, dst); + // parameter to ZeroMemset helps workaround the issue. + // See https://github.com/kokkos/kokkos/issues/7273. + ZeroMemset( + exec, dst.data(), dst.size() * sizeof(typename ViewType::value_type)); else #endif contiguous_fill(exec, dst, value); @@ -1423,9 +1418,7 @@ contiguous_fill_or_memset( template inline std::enable_if_t< - !(std::is_trivial::value_type>::value && - std::is_trivially_copy_assignable< - typename ViewTraits::value_type>::value)> + !std::is_trivial_v::value_type>> contiguous_fill_or_memset( const View& dst, typename ViewTraits::const_value_type& value) { @@ -1441,8 +1434,8 @@ template inline void deep_copy( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using ViewType = View; using exec_space_type = typename ViewType::execution_space; @@ -1464,8 +1457,8 @@ inline void deep_copy( } Kokkos::fence("Kokkos::deep_copy: scalar copy, pre copy fence"); - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); // If contiguous we can simply do a 1D flat loop or use memset @@ -1482,21 +1475,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -1539,8 +1531,8 @@ template inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; @@ -1576,8 +1568,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using dst_type = View; @@ -1587,8 +1579,8 @@ inline void deep_copy( using dst_memory_space = typename dst_type::memory_space; using src_memory_space = typename src_type::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -1628,8 +1620,8 @@ template inline void deep_copy( const View& dst, const View& src, std::enable_if_t< - (std::is_void::specialize>::value && - std::is_void::specialize>::value && + (std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; @@ -1641,8 +1633,8 @@ inline void deep_copy( using dst_value_type = typename dst_type::value_type; using src_value_type = typename src_type::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -1772,10 +1764,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2191,8 +2183,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const TeamType& team, const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { Kokkos::parallel_for(Kokkos::TeamVectorRange(team, dst.span()), [&](const int& i) { dst.data()[i] = value; }); } @@ -2201,8 +2193,8 @@ template void KOKKOS_INLINE_FUNCTION local_deep_copy_contiguous( const View& dst, typename ViewTraits::const_value_type& value, - std::enable_if_t::specialize, - void>::value>* = nullptr) { + std::enable_if_t::specialize, + void>>* = nullptr) { for (size_t i = 0; i < dst.span(); ++i) { dst.data()[i] = value; } @@ -2568,13 +2560,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2594,21 +2586,20 @@ inline void deep_copy( int64_t strides[ViewType::rank + 1]; dst.stride(strides); Kokkos::Iterate iterate; - if (std::is_same::value) { + if (std::is_same_v) { iterate = Kokkos::Iterate::Right; - } else if (std::is_same::value) { + } else if (std::is_same_v) { iterate = Kokkos::Iterate::Left; - } else if (std::is_same::value) { + } else if (std::is_same_v) { if (strides[0] > strides[ViewType::rank > 0 ? ViewType::rank - 1 : 0]) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; } else { - if (std::is_same::value) + if (std::is_same_v) iterate = Kokkos::Iterate::Right; else iterate = Kokkos::Iterate::Left; @@ -2649,13 +2640,13 @@ inline void deep_copy( typename ViewTraits::const_value_type& value, std::enable_if_t< Kokkos::is_execution_space::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && !Kokkos::SpaceAccessibility:: memory_space>::accessible>* = nullptr) { using dst_traits = ViewTraits; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const type"); using dst_memory_space = typename dst_traits::memory_space; if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2696,8 +2687,8 @@ inline void deep_copy( typename ViewTraits::non_const_value_type& dst, const View& src, std::enable_if_t::value && - std::is_same::specialize, - void>::value>* = nullptr) { + std::is_same_v::specialize, + void>>* = nullptr) { using src_traits = ViewTraits; using src_memory_space = typename src_traits::memory_space; static_assert(src_traits::rank == 0, @@ -2734,8 +2725,8 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) == unsigned(0) && unsigned(ViewTraits::rank) == unsigned(0)))>* = nullptr) { using src_traits = ViewTraits; @@ -2743,8 +2734,8 @@ inline void deep_copy( using src_memory_space = typename src_traits::memory_space; using dst_memory_space = typename dst_traits::memory_space; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires matching non-const destination type"); if (Kokkos::Tools::Experimental::get_callbacks().begin_deep_copy != nullptr) { @@ -2784,15 +2775,15 @@ inline void deep_copy( const View& src, std::enable_if_t< (Kokkos::is_execution_space::value && - std::is_void::specialize>::value && - std::is_void::specialize>::value && + std::is_void_v::specialize> && + std::is_void_v::specialize> && (unsigned(ViewTraits::rank) != 0 || unsigned(ViewTraits::rank) != 0))>* = nullptr) { using dst_type = View; using src_type = View; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "deep_copy requires non-const destination type"); static_assert((unsigned(dst_type::rank) == unsigned(src_type::rank)), @@ -2922,10 +2913,10 @@ inline void deep_copy( // If same type, equal layout, equal dimensions, equal span, and contiguous // memory then can byte-wise copy - if (std::is_same::value && - (std::is_same::value || + if (std::is_same_v && + (std::is_same_v || (dst_type::rank == 1 && src_type::rank == 1)) && dst.span_is_contiguous() && src.span_is_contiguous() && ((dst_type::rank < 1) || (dst.stride_0() == src.stride_0())) && @@ -2994,11 +2985,11 @@ bool size_mismatch(const ViewType& view, unsigned int max_extent, /** \brief Resize a view with copying old data to new data at the corresponding * indices. */ template -inline typename std::enable_if< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value>::type +inline std::enable_if_t< + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, @@ -3048,10 +3039,10 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3066,10 +3057,10 @@ resize(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> resize(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3085,10 +3076,10 @@ template inline std::enable_if_t< (Impl::is_view_ctor_property::value || Kokkos::is_execution_space::value) && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> resize(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3103,12 +3094,12 @@ resize(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3149,12 +3140,12 @@ impl_resize(const Impl::ViewCtorProp& arg_prop, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_resize(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const typename Kokkos::View::array_layout& layout) { @@ -3218,10 +3209,10 @@ inline void resize(Kokkos::View& v, /** \brief Resize a view with discarding old data. */ template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, const size_t n2, const size_t n3, const size_t n4, const size_t n5, const size_t n6, const size_t n7, @@ -3264,10 +3255,10 @@ impl_realloc(Kokkos::View& v, const size_t n0, const size_t n1, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(const Impl::ViewCtorProp& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3283,10 +3274,10 @@ realloc(const Impl::ViewCtorProp& arg_prop, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>> realloc(Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3302,10 +3293,10 @@ realloc(Kokkos::View& v, template inline std::enable_if_t< Impl::is_view_ctor_property::value && - (std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value)> + (std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight>)> realloc(const I& arg_prop, Kokkos::View& v, const size_t n0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, const size_t n1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -3320,12 +3311,12 @@ realloc(const I& arg_prop, Kokkos::View& v, template inline std::enable_if_t< - std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value> + std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3365,12 +3356,12 @@ impl_realloc(Kokkos::View& v, // the same as the existing one. template inline std::enable_if_t< - !(std::is_same::array_layout, - Kokkos::LayoutLeft>::value || - std::is_same::array_layout, - Kokkos::LayoutRight>::value || - std::is_same::array_layout, - Kokkos::LayoutStride>::value)> + !(std::is_same_v::array_layout, + Kokkos::LayoutLeft> || + std::is_same_v::array_layout, + Kokkos::LayoutRight> || + std::is_same_v::array_layout, + Kokkos::LayoutStride>)> impl_realloc(Kokkos::View& v, const typename Kokkos::View::array_layout& layout, const Impl::ViewCtorProp& arg_prop) { @@ -3435,7 +3426,7 @@ struct MirrorViewType { // Check whether it is the same memory space enum { is_same_memspace = - std::is_same::value + std::is_same_v }; // The array_layout using array_layout = typename src_view_type::array_layout; @@ -3450,26 +3441,6 @@ struct MirrorViewType { std::conditional_t; }; -template -struct MirrorType { - // The incoming view_type - using src_view_type = typename Kokkos::View; - // The memory space for the mirror view - using memory_space = typename Space::memory_space; - // Check whether it is the same memory space - enum { - is_same_memspace = - std::is_same::value - }; - // The array_layout - using array_layout = typename src_view_type::array_layout; - // The data type (we probably want it non-const since otherwise we can't even - // deep_copy to it. - using data_type = typename src_view_type::non_const_data_type; - // The destination view type if it is not the same memory space - using view_type = Kokkos::View; -}; - // collection of static asserts for create_mirror and create_mirror_view template void check_view_ctor_args_create_mirror() { @@ -3503,7 +3474,7 @@ inline auto create_mirror(const Kokkos::View& src, if constexpr (Impl::ViewCtorProp::has_memory_space) { using memory_space = typename decltype(prop_copy)::memory_space; using dst_type = - typename Impl::MirrorType::view_type; + typename Impl::MirrorViewType::dest_view_type; return dst_type(prop_copy, src.layout()); } else { using dst_type = typename View::HostMirror; @@ -3636,12 +3607,12 @@ inline auto create_mirror_view( const Kokkos::View& src, [[maybe_unused]] const Impl::ViewCtorProp& arg_prop) { if constexpr (!Impl::ViewCtorProp::has_memory_space) { - if constexpr (std::is_same::memory_space, - typename Kokkos::View< - T, P...>::HostMirror::memory_space>::value && - std::is_same::data_type, - typename Kokkos::View< - T, P...>::HostMirror::data_type>::value) { + if constexpr (std::is_same_v::memory_space, + typename Kokkos::View< + T, P...>::HostMirror::memory_space> && + std::is_same_v< + typename Kokkos::View::data_type, + typename Kokkos::View::HostMirror::data_type>) { check_view_ctor_args_create_mirror(); return typename Kokkos::View::HostMirror(src); } else { @@ -3785,8 +3756,7 @@ create_mirror_view_and_copy( const Space&, const Kokkos::View& src, std::string const& name = "", std::enable_if_t< - std::is_void::specialize>::value>* = - nullptr) { + std::is_void_v::specialize>>* = nullptr) { return create_mirror_view_and_copy( Kokkos::view_alloc(typename Space::memory_space{}, name), src); } diff --git a/lib/kokkos/core/src/Kokkos_Core.hpp b/lib/kokkos/core/src/Kokkos_Core.hpp index 1f146563be..9588d289a9 100644 --- a/lib/kokkos/core/src/Kokkos_Core.hpp +++ b/lib/kokkos/core/src/Kokkos_Core.hpp @@ -63,7 +63,9 @@ #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include @@ -248,9 +250,9 @@ class KOKKOS_ATTRIBUTE_NODISCARD ScopeGuard { } ScopeGuard& operator=(const ScopeGuard&) = delete; - ScopeGuard& operator=(ScopeGuard&&) = delete; - ScopeGuard(const ScopeGuard&) = delete; - ScopeGuard(ScopeGuard&&) = delete; + ScopeGuard& operator=(ScopeGuard&&) = delete; + ScopeGuard(const ScopeGuard&) = delete; + ScopeGuard(ScopeGuard&&) = delete; }; } // namespace Kokkos @@ -281,7 +283,7 @@ std::vector partition_space(ExecSpace const& space, "Kokkos Error: partition_space expects an Execution Space as " "first argument"); static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); std::vector instances(weights.size()); diff --git a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp index 7edb35f00e..5dbe571429 100644 --- a/lib/kokkos/core/src/Kokkos_Core_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_Core_fwd.hpp @@ -106,8 +106,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_HIP) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = HIP; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_SYCL) -using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = - Experimental::SYCL; +using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = SYCL; #elif defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENACC) using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Experimental::OpenACC; @@ -122,7 +121,7 @@ using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = using DefaultExecutionSpace KOKKOS_IMPL_DEFAULT_EXEC_SPACE_ANNOTATION = Serial; #else #error \ - "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::Experimental::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." + "At least one of the following execution spaces must be defined in order to use Kokkos: Kokkos::Cuda, Kokkos::HIP, Kokkos::SYCL, Kokkos::Experimental::OpenMPTarget, Kokkos::Experimental::OpenACC, Kokkos::OpenMP, Kokkos::Threads, Kokkos::Experimental::HPX, or Kokkos::Serial." #endif #if defined(KOKKOS_ENABLE_DEFAULT_DEVICE_TYPE_OPENMP) @@ -162,7 +161,7 @@ using SharedSpace = CudaUVMSpace; using SharedSpace = HIPManagedSpace; #define KOKKOS_HAS_SHARED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) -using SharedSpace = Experimental::SYCLSharedUSMSpace; +using SharedSpace = SYCLSharedUSMSpace; #define KOKKOS_HAS_SHARED_SPACE // if only host compile point to HostSpace #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) @@ -184,7 +183,7 @@ using SharedHostPinnedSpace = CudaHostPinnedSpace; using SharedHostPinnedSpace = HIPHostPinnedSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif defined(KOKKOS_ENABLE_SYCL) - using SharedHostPinnedSpace = Experimental::SYCLHostUSMSpace; + using SharedHostPinnedSpace = SYCLHostUSMSpace; #define KOKKOS_HAS_SHARED_HOST_PINNED_SPACE #elif !defined(KOKKOS_ENABLE_OPENACC) && !defined(KOKKOS_ENABLE_OPENMPTARGET) using SharedHostPinnedSpace = HostSpace; diff --git a/lib/kokkos/core/src/Kokkos_Crs.hpp b/lib/kokkos/core/src/Kokkos_Crs.hpp index 92931b5849..69223b6412 100644 --- a/lib/kokkos/core/src/Kokkos_Crs.hpp +++ b/lib/kokkos/core/src/Kokkos_Crs.hpp @@ -84,12 +84,12 @@ class Crs { /* * Default Constructors, operators and destructor */ - KOKKOS_DEFAULTED_FUNCTION Crs() = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs const&) = default; + KOKKOS_DEFAULTED_FUNCTION Crs(Crs&&) = default; KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs const&) = default; - KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; - KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; + KOKKOS_DEFAULTED_FUNCTION Crs& operator=(Crs&&) = default; + KOKKOS_DEFAULTED_FUNCTION ~Crs() = default; /** \brief Assign to a view of the rhs array. * If the old view is the last view @@ -148,7 +148,7 @@ class GetCrsTransposeCounts { public: KOKKOS_INLINE_FUNCTION - void operator()(index_type i) const { atomic_increment(&out[in.entries(i)]); } + void operator()(index_type i) const { atomic_inc(&out[in.entries(i)]); } GetCrsTransposeCounts(InCrs const& arg_in, OutCounts const& arg_out) : in(arg_in), out(arg_out) { using policy_type = RangePolicy; @@ -345,7 +345,7 @@ struct CountAndFill : public CountAndFillBase { closure.execute(); } auto nentries = Kokkos::get_crs_row_map_from_counts(this->m_crs.row_map, - this->m_counts); + this->m_counts); this->m_counts = counts_type(); this->m_crs.entries = entries_type("entries", nentries); { diff --git a/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp index ae28805a42..8af10b2a40 100644 --- a/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp +++ b/lib/kokkos/core/src/Kokkos_DetectionIdiom.hpp @@ -54,8 +54,8 @@ struct detector>, Op, Args...> { } // namespace Impl struct nonesuch : private Impl::nonesuch_base { - ~nonesuch() = delete; - nonesuch(nonesuch const&) = delete; + ~nonesuch() = delete; + nonesuch(nonesuch const&) = delete; void operator=(nonesuch const&) = delete; }; @@ -81,7 +81,7 @@ inline constexpr bool is_detected_v = is_detected::value; template class Op, class... Args> inline constexpr bool is_detected_exact_v = - is_detected_exact::value; + is_detected_exact::value; // NOLINT template class Op, class... Args> inline constexpr bool is_detected_convertible_v = diff --git a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp index b8d7f77deb..dd7ce5ce21 100644 --- a/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_ExecPolicy.hpp @@ -27,7 +27,10 @@ static_assert(false, #include #include #include +#include +#ifndef KOKKOS_ENABLE_IMPL_TYPEINFO #include +#endif #include //---------------------------------------------------------------------------- @@ -197,8 +200,7 @@ class RangePolicy : public Impl::PolicyTraits { /** \brief finalize chunk_size if it was set to AUTO*/ inline void set_auto_chunk_size() { #ifdef KOKKOS_ENABLE_SYCL - if (std::is_same_v) { + if (std::is_same_v) { // chunk_size <=1 lets the compiler choose the workgroup size when // launching kernels m_granularity = 1; @@ -248,46 +250,49 @@ class RangePolicy : public Impl::PolicyTraits { // To be replaced with std::in_range (c++20) template - static void check_conversion_safety(const IndexType bound) { + static void check_conversion_safety([[maybe_unused]] const IndexType bound) { + // Checking that the round-trip conversion preserves input index value + if constexpr (std::is_convertible_v) { #if !defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) || \ defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) - std::string msg = - "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " - "is performed on a bound (" + - std::to_string(bound) + - "), which may " - "not preserve its original value.\n"; - bool warn = false; + std::string msg = + "Kokkos::RangePolicy bound type error: an unsafe implicit conversion " + "is performed on a bound (" + + std::to_string(bound) + + "), which may " + "not preserve its original value.\n"; + bool warn = false; - if constexpr (std::is_signed_v != - std::is_signed_v) { - // check signed to unsigned - if constexpr (std::is_signed_v) - warn |= (bound < static_cast( - std::numeric_limits::min())); + if constexpr (std::is_arithmetic_v && + (std::is_signed_v != + std::is_signed_v)) { + // check signed to unsigned + if constexpr (std::is_signed_v) + warn |= (bound < static_cast( + std::numeric_limits::min())); - // check unsigned to signed - if constexpr (std::is_signed_v) - warn |= (bound > static_cast( - std::numeric_limits::max())); - } + // check unsigned to signed + if constexpr (std::is_signed_v) + warn |= (bound > static_cast( + std::numeric_limits::max())); + } - // check narrowing - warn |= (static_cast(static_cast(bound)) != bound); + // check narrowing + warn |= + (static_cast(static_cast(bound)) != bound); - if (warn) { + if (warn) { #ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 - Kokkos::abort(msg.c_str()); + Kokkos::abort(msg.c_str()); #endif #ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS - Kokkos::Impl::log_warning(msg); + Kokkos::Impl::log_warning(msg); +#endif + } #endif } -#else - (void)bound; -#endif } public: @@ -333,20 +338,20 @@ class RangePolicy : public Impl::PolicyTraits { }; }; -RangePolicy()->RangePolicy<>; +RangePolicy() -> RangePolicy<>; -RangePolicy(int64_t, int64_t)->RangePolicy<>; -RangePolicy(int64_t, int64_t, ChunkSize const&)->RangePolicy<>; +RangePolicy(int64_t, int64_t) -> RangePolicy<>; +RangePolicy(int64_t, int64_t, ChunkSize const&) -> RangePolicy<>; -RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t)->RangePolicy<>; +RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t) -> RangePolicy<>; RangePolicy(DefaultExecutionSpace const&, int64_t, int64_t, ChunkSize const&) - ->RangePolicy<>; + -> RangePolicy<>; template >> -RangePolicy(ES const&, int64_t, int64_t)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t) -> RangePolicy; template >> -RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&)->RangePolicy; +RangePolicy(ES const&, int64_t, int64_t, ChunkSize const&) -> RangePolicy; } // namespace Kokkos @@ -515,24 +520,24 @@ struct PerThreadValue { template struct ExtractVectorLength { static inline iType value( - std::enable_if_t::value, iType> val, Args...) { + std::enable_if_t, iType> val, Args...) { return val; } - static inline std::enable_if_t::value, int> value( - std::enable_if_t::value, iType>, Args...) { + static inline std::enable_if_t, int> value( + std::enable_if_t, iType>, Args...) { return 1; } }; template -inline std::enable_if_t::value, iType> -extract_vector_length(iType val, Args...) { +inline std::enable_if_t, iType> extract_vector_length( + iType val, Args...) { return val; } template -inline std::enable_if_t::value, int> -extract_vector_length(iType, Args...) { +inline std::enable_if_t, int> extract_vector_length( + iType, Args...) { return 1; } @@ -577,7 +582,7 @@ struct ScratchRequest { } }; -// Throws a runtime exception if level is not `0` or `1` +// Causes abnormal program termination if level is not `0` or `1` void team_policy_check_valid_storage_level_argument(int level); /** \brief Execution policy for parallel work over a league of teams of @@ -721,55 +726,54 @@ class TeamPolicy // Execution space not provided deduces to TeamPolicy<> -TeamPolicy()->TeamPolicy<>; +TeamPolicy() -> TeamPolicy<>; -TeamPolicy(int, int)->TeamPolicy<>; -TeamPolicy(int, int, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, int)->TeamPolicy<>; -TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&)->TeamPolicy<>; -TeamPolicy(int, int, Kokkos::AUTO_t const&)->TeamPolicy<>; +TeamPolicy(int, int) -> TeamPolicy<>; +TeamPolicy(int, int, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, int) -> TeamPolicy<>; +TeamPolicy(int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) -> TeamPolicy<>; +TeamPolicy(int, int, Kokkos::AUTO_t const&) -> TeamPolicy<>; // DefaultExecutionSpace deduces to TeamPolicy<> -TeamPolicy(DefaultExecutionSpace const&, int, int)->TeamPolicy<>; -TeamPolicy(DefaultExecutionSpace const&, int, int, int)->TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int) -> TeamPolicy<>; +TeamPolicy(DefaultExecutionSpace const&, int, int, int) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, int) - ->TeamPolicy<>; + -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, Kokkos::AUTO_t const&, - Kokkos::AUTO_t const&) - ->TeamPolicy<>; + Kokkos::AUTO_t const&) -> TeamPolicy<>; TeamPolicy(DefaultExecutionSpace const&, int, int, Kokkos::AUTO_t const&) - ->TeamPolicy<>; + -> TeamPolicy<>; // ES != DefaultExecutionSpace deduces to TeamPolicy template >> -TeamPolicy(ES const&, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, int)->TeamPolicy; +TeamPolicy(ES const&, int, int, int) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&) -> TeamPolicy; template >> -TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int)->TeamPolicy; +TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, int) -> TeamPolicy; template >> TeamPolicy(ES const&, int, Kokkos::AUTO_t const&, Kokkos::AUTO_t const&) - ->TeamPolicy; + -> TeamPolicy; template >> -TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&)->TeamPolicy; +TeamPolicy(ES const&, int, int, Kokkos::AUTO_t const&) -> TeamPolicy; namespace Impl { @@ -1041,7 +1045,7 @@ struct TeamThreadMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamThreadMDRange(TeamHandle const&, Args&&...) - ->TeamThreadMDRange, TeamHandle>; + -> TeamThreadMDRange, TeamHandle>; template struct ThreadVectorMDRange; @@ -1078,7 +1082,7 @@ struct ThreadVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE ThreadVectorMDRange(TeamHandle const&, Args&&...) - ->ThreadVectorMDRange, TeamHandle>; + -> ThreadVectorMDRange, TeamHandle>; template struct TeamVectorMDRange; @@ -1115,7 +1119,7 @@ struct TeamVectorMDRange, TeamHandle> { template KOKKOS_DEDUCTION_GUIDE TeamVectorMDRange(TeamHandle const&, Args&&...) - ->TeamVectorMDRange, TeamHandle>; + -> TeamVectorMDRange, TeamHandle>; template @@ -1162,7 +1166,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1198,7 +1202,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( Kokkos::HIP> #elif defined(KOKKOS_ENABLE_SYCL) || std::is_same_v + Kokkos::SYCL> #endif ) policy.team.vector_reduce( @@ -1217,15 +1221,21 @@ KOKKOS_INLINE_FUNCTION void parallel_for( namespace Impl { template ::value> + bool HasTag = !std::is_void_v> struct ParallelConstructName; template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = + std::string(TypeInfo>::name()) + + "/" + std::string(TypeInfo::name()); +#else default_name = std::string(typeid(FunctorType).name()) + "/" + typeid(TagType).name(); +#endif } } std::string const& get() { @@ -1239,7 +1249,11 @@ template struct ParallelConstructName { ParallelConstructName(std::string const& label) : label_ref(label) { if (label.empty()) { - default_name = std::string(typeid(FunctorType).name()); +#ifdef KOKKOS_ENABLE_IMPL_TYPEINFO + default_name = TypeInfo>::name(); +#else + default_name = typeid(FunctorType).name(); +#endif } } std::string const& get() { diff --git a/lib/kokkos/core/src/Kokkos_Extents.hpp b/lib/kokkos/core/src/Kokkos_Extents.hpp index 9bc2eda604..7d1f8c755d 100644 --- a/lib/kokkos/core/src/Kokkos_Extents.hpp +++ b/lib/kokkos/core/src/Kokkos_Extents.hpp @@ -134,7 +134,7 @@ struct ApplyExtent { template struct ApplyExtent { - using type = ValueType * [Ext]; + using type = ValueType* [Ext]; }; template diff --git a/lib/kokkos/core/src/Kokkos_Future.hpp b/lib/kokkos/core/src/Kokkos_Future.hpp index 0b3a153de8..c26d08be1c 100644 --- a/lib/kokkos/core/src/Kokkos_Future.hpp +++ b/lib/kokkos/core/src/Kokkos_Future.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_FUTURE_HPP #define KOKKOS_FUTURE_HPP @@ -41,13 +47,19 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // For now, hack this in as a partial specialization // TODO @tasking @cleanup Make this the "normal" class template and make the old // code the specialization template -class BasicFuture> { +class KOKKOS_DEPRECATED + BasicFuture> { public: using value_type = ValueType; using execution_space = ExecutionSpace; @@ -244,7 +256,7 @@ class BasicFuture> { //////////////////////////////////////////////////////////////////////////////// template -class BasicFuture { +class KOKKOS_DEPRECATED BasicFuture { private: template friend class BasicTaskScheduler; @@ -413,13 +425,13 @@ class BasicFuture { // Is a Future with the given execution space template -struct is_future : public std::false_type {}; +struct KOKKOS_DEPRECATED is_future : public std::false_type {}; template -struct is_future, ExecSpace> +struct KOKKOS_DEPRECATED is_future, ExecSpace> : std::bool_constant< - std::is_same::value || - std::is_void::value> {}; + std::is_same_v || + std::is_void_v> {}; //////////////////////////////////////////////////////////////////////////////// // END OLD CODE @@ -432,8 +444,8 @@ class ResolveFutureArgOrder { private: enum { Arg1_is_space = Kokkos::is_space::value }; enum { Arg2_is_space = Kokkos::is_space::value }; - enum { Arg1_is_value = !Arg1_is_space && !std::is_void::value }; - enum { Arg2_is_value = !Arg2_is_space && !std::is_void::value }; + enum { Arg1_is_value = !Arg1_is_space && !std::is_void_v }; + enum { Arg2_is_value = !Arg2_is_space && !std::is_void_v }; static_assert(!(Arg1_is_space && Arg2_is_space), "Future cannot be given two spaces"); @@ -463,10 +475,15 @@ class ResolveFutureArgOrder { * */ template -using Future = typename Impl::ResolveFutureArgOrder::type; +using Future KOKKOS_DEPRECATED = + typename Impl::ResolveFutureArgOrder::type; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_Graph.hpp b/lib/kokkos/core/src/Kokkos_Graph.hpp index 9cc6650e26..05d774ac61 100644 --- a/lib/kokkos/core/src/Kokkos_Graph.hpp +++ b/lib/kokkos/core/src/Kokkos_Graph.hpp @@ -86,10 +86,21 @@ struct [[nodiscard]] Graph { return m_impl_ptr->get_execution_space(); } - void submit() const { + void instantiate() { KOKKOS_EXPECTS(bool(m_impl_ptr)) - (*m_impl_ptr).submit(); + (*m_impl_ptr).instantiate(); } + + void submit(const execution_space& exec) const { + KOKKOS_EXPECTS(bool(m_impl_ptr)) + (*m_impl_ptr).submit(exec); + } + + void submit() const { submit(get_execution_space()); } + + decltype(auto) native_graph(); + + decltype(auto) native_graph_exec(); }; // end Graph }}}1 @@ -135,22 +146,68 @@ Graph create_graph(ExecutionSpace ex, Closure&& arg_closure) { // function template injection works. auto rv = Kokkos::Impl::GraphAccess::construct_graph(std::move(ex)); // Invoke the user's graph construction closure - ((Closure &&) arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); + ((Closure&&)arg_closure)(Kokkos::Impl::GraphAccess::create_root_ref(rv)); // and given them back the graph // KOKKOS_ENSURES(rv.m_impl_ptr.use_count() == 1) return rv; } +template +std::enable_if_t, + Graph> +create_graph(ExecutionSpace exec = ExecutionSpace{}) { + return Kokkos::Impl::GraphAccess::construct_graph(std::move(exec)); +} + template < class ExecutionSpace = DefaultExecutionSpace, class Closure = Kokkos::Impl::DoNotExplicitlySpecifyThisTemplateParameter> -Graph create_graph(Closure&& arg_closure) { - return create_graph(ExecutionSpace{}, (Closure &&) arg_closure); +std::enable_if_t< + !Kokkos::is_execution_space_v>, + Graph> +create_graph(Closure&& arg_closure) { + return create_graph(ExecutionSpace{}, (Closure&&)arg_closure); } // end create_graph }}}1 //============================================================================== +template +decltype(auto) Graph::native_graph() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->sycl_graph(); + } +#endif +} + +template +decltype(auto) Graph::native_graph_exec() { + KOKKOS_EXPECTS(bool(m_impl_ptr)); +#if defined(KOKKOS_ENABLE_CUDA) + if constexpr (std::is_same_v) { + return m_impl_ptr->cuda_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_HIP) && defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->hip_graph_exec(); + } +#elif defined(KOKKOS_ENABLE_SYCL) && defined(SYCL_EXT_ONEAPI_GRAPH) + if constexpr (std::is_same_v) { + return m_impl_ptr->sycl_graph_exec(); + } +#endif +} + } // end namespace Experimental } // namespace Kokkos @@ -163,7 +220,7 @@ Graph create_graph(Closure&& arg_closure) { #include #if defined(KOKKOS_ENABLE_HIP) // The implementation of hipGraph in ROCm 5.2 is bugged, so we cannot use it. -#if !((HIP_VERSION_MAJOR == 5) && (HIP_VERSION_MINOR == 2)) +#if defined(KOKKOS_IMPL_HIP_NATIVE_GRAPH) #include #endif #endif diff --git a/lib/kokkos/core/src/Kokkos_GraphNode.hpp b/lib/kokkos/core/src/Kokkos_GraphNode.hpp index 2a4e2cf641..a0a60c07d0 100644 --- a/lib/kokkos/core/src/Kokkos_GraphNode.hpp +++ b/lib/kokkos/core/src/Kokkos_GraphNode.hpp @@ -48,7 +48,7 @@ class GraphNodeRef { // intended to be SFINAE-safe, so do validation before you instantiate. static_assert( - std::is_same::value || + std::is_same_v || Kokkos::Impl::is_specialization_of::value, "Invalid predecessor template parameter given to GraphNodeRef"); @@ -56,7 +56,7 @@ class GraphNodeRef { Kokkos::is_execution_space::value, "Invalid execution space template parameter given to GraphNodeRef"); - static_assert(std::is_same::value || + static_assert(std::is_same_v || Kokkos::Impl::is_graph_kernel::value, "Invalid kernel template parameter given to GraphNodeRef"); @@ -151,7 +151,7 @@ class GraphNodeRef { typename return_t::node_impl_t>( m_node_impl->execution_space_instance(), Kokkos::Impl::_graph_node_kernel_ctor_tag{}, - (NextKernelDeduced &&) arg_kernel, + (NextKernelDeduced&&)arg_kernel, // *this is the predecessor Kokkos::Impl::_graph_node_predecessor_ctor_tag{}, *this)); @@ -184,10 +184,10 @@ class GraphNodeRef { // {{{3 // Copyable and movable (basically just shared_ptr semantics - GraphNodeRef() noexcept = default; - GraphNodeRef(GraphNodeRef const&) = default; - GraphNodeRef(GraphNodeRef&&) noexcept = default; - GraphNodeRef& operator=(GraphNodeRef const&) = default; + GraphNodeRef() noexcept = default; + GraphNodeRef(GraphNodeRef const&) = default; + GraphNodeRef(GraphNodeRef&&) noexcept = default; + GraphNodeRef& operator=(GraphNodeRef const&) = default; GraphNodeRef& operator=(GraphNodeRef&&) noexcept = default; ~GraphNodeRef() = default; @@ -197,19 +197,19 @@ class GraphNodeRef { //- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - // {{{3 - template < - class OtherKernel, class OtherPredecessor, - std::enable_if_t< - // Not a copy/move constructor - !std::is_same>::value && - // must be an allowed type erasure of the kernel - Kokkos::Impl::is_compatible_type_erasure::value && - // must be an allowed type erasure of the predecessor - Kokkos::Impl::is_compatible_type_erasure< - OtherPredecessor, graph_predecessor>::value, - int> = 0> + template > && + // must be an allowed type erasure of the kernel + Kokkos::Impl::is_compatible_type_erasure< + OtherKernel, graph_kernel>::value && + // must be an allowed type erasure of the predecessor + Kokkos::Impl::is_compatible_type_erasure< + OtherPredecessor, graph_predecessor>::value, + int> = 0> /* implicit */ GraphNodeRef( GraphNodeRef const& other) @@ -257,7 +257,7 @@ class GraphNodeRef { //|| policy_t::execution_space_is_defaulted, "Execution Space mismatch between execution policy and graph"); - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using next_policy_t = decltype(policy); @@ -266,8 +266,8 @@ class GraphNodeRef { std::decay_t, Kokkos::ParallelForTag>; return this->_then_kernel(next_kernel_t{std::move(arg_name), policy.space(), - (Functor &&) functor, - (Policy &&) policy}); + (Functor&&)functor, + (Policy&&)policy}); } template < @@ -280,8 +280,7 @@ class GraphNodeRef { int> = 0> auto then_parallel_for(Policy&& policy, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", (Policy &&) policy, - (Functor &&) functor); + return this->then_parallel_for("", (Policy&&)policy, (Functor&&)functor); } template @@ -290,13 +289,13 @@ class GraphNodeRef { // needs to static assert constraint: DataParallelFunctor return this->then_parallel_for(std::move(name), Kokkos::RangePolicy(0, n), - (Functor &&) functor); + (Functor&&)functor); } template auto then_parallel_for(std::size_t n, Functor&& functor) const { // needs to static assert constraint: DataParallelFunctor - return this->then_parallel_for("", n, (Functor &&) functor); + return this->then_parallel_for("", n, (Functor&&)functor); } // end then_parallel_for }}}2 @@ -359,6 +358,23 @@ class GraphNodeRef { Kokkos::is_reducer::value, "Output argument to parallel reduce in a graph must be a " "View or a Reducer"); + + if constexpr (Kokkos::is_reducer_v) { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, typename return_type_remove_cvref:: + result_view_type::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } else { + static_assert( + Kokkos::SpaceAccessibility< + ExecutionSpace, + typename return_type_remove_cvref::memory_space>::accessible, + "The reduction target must be accessible by the graph execution " + "space."); + } + using return_type = // Yes, you do really have to do this... std::conditional_t::value, @@ -373,7 +389,7 @@ class GraphNodeRef { // End of Kokkos reducer disaster //---------------------------------------- - auto policy = Experimental::require((Policy &&) arg_policy, + auto policy = Experimental::require((Policy&&)arg_policy, Kokkos::Impl::KernelInGraphProperty{}); using passed_reducer_type = typename return_value_adapter::reducer_type; @@ -399,7 +415,7 @@ class GraphNodeRef { return this->_then_kernel(next_kernel_t{ std::move(arg_name), graph_impl_ptr->get_execution_space(), - functor_reducer, (Policy &&) policy, + functor_reducer, (Policy&&)policy, return_value_adapter::return_value(return_value, functor)}); } @@ -413,9 +429,9 @@ class GraphNodeRef { int> = 0> auto then_parallel_reduce(Policy&& arg_policy, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", (Policy &&) arg_policy, - (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", (Policy&&)arg_policy, + (Functor&&)functor, + (ReturnType&&)return_value); } template @@ -425,15 +441,15 @@ class GraphNodeRef { ReturnType&& return_value) const { return this->then_parallel_reduce( std::move(label), Kokkos::RangePolicy{0, idx_end}, - (Functor &&) functor, (ReturnType &&) return_value); + (Functor&&)functor, (ReturnType&&)return_value); } template auto then_parallel_reduce(typename execution_space::size_type idx_end, Functor&& functor, ReturnType&& return_value) const { - return this->then_parallel_reduce("", idx_end, (Functor &&) functor, - (ReturnType &&) return_value); + return this->then_parallel_reduce("", idx_end, (Functor&&)functor, + (ReturnType&&)return_value); } // end then_parallel_reduce }}}2 diff --git a/lib/kokkos/core/src/Kokkos_HostSpace.hpp b/lib/kokkos/core/src/Kokkos_HostSpace.hpp index 8b5f29f95b..706586826f 100644 --- a/lib/kokkos/core/src/Kokkos_HostSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_HostSpace.hpp @@ -63,10 +63,10 @@ class HostSpace { //! This memory space preferred device_type using device_type = Kokkos::Device; - HostSpace() = default; - HostSpace(HostSpace&& rhs) = default; - HostSpace(const HostSpace& rhs) = default; - HostSpace& operator=(HostSpace&&) = default; + HostSpace() = default; + HostSpace(HostSpace&& rhs) = default; + HostSpace(const HostSpace& rhs) = default; + HostSpace& operator=(HostSpace&&) = default; HostSpace& operator=(const HostSpace&) = default; ~HostSpace() = default; @@ -183,18 +183,6 @@ namespace Kokkos { namespace Impl { -template <> -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - hostspace_parallel_deepcopy(dst, src, n); - } - - DeepCopy(const DefaultHostExecutionSpace& exec, void* dst, const void* src, - size_t n) { - hostspace_parallel_deepcopy_async(exec, dst, src, n); - } -}; - template struct DeepCopy { DeepCopy(void* dst, const void* src, size_t n) { @@ -202,10 +190,15 @@ struct DeepCopy { } DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); - hostspace_parallel_deepcopy_async(dst, src, n); + if constexpr (!Kokkos::SpaceAccessibility::accessible) { + exec.fence( + "Kokkos::Impl::DeepCopy::DeepCopy: fence before copy"); + hostspace_parallel_deepcopy_async(dst, src, n); + } else { + hostspace_parallel_deepcopy_async(exec, dst, src, n); + } } }; diff --git a/lib/kokkos/core/src/Kokkos_Layout.hpp b/lib/kokkos/core/src/Kokkos_Layout.hpp index 37b80e54a8..a760e7054a 100644 --- a/lib/kokkos/core/src/Kokkos_Layout.hpp +++ b/lib/kokkos/core/src/Kokkos_Layout.hpp @@ -52,13 +52,17 @@ struct LayoutLeft { using array_layout = LayoutLeft; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutLeft(LayoutLeft const&) = default; - LayoutLeft(LayoutLeft&&) = default; + LayoutLeft(LayoutLeft const&) = default; + LayoutLeft(LayoutLeft&&) = default; LayoutLeft& operator=(LayoutLeft const&) = default; - LayoutLeft& operator=(LayoutLeft&&) = default; + LayoutLeft& operator=(LayoutLeft&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutLeft(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -69,7 +73,8 @@ struct LayoutLeft { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride(KOKKOS_IMPL_CTOR_DEFAULT_ARG) {} friend bool operator==(const LayoutLeft& left, const LayoutLeft& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -101,13 +106,17 @@ struct LayoutRight { using array_layout = LayoutRight; size_t dimension[ARRAY_LAYOUT_MAX_RANK]; + // we don't have a constructor to set the stride directly + // but we will deprecate the class anyway (or at least using an instance of + // this class) when switching the internal implementation to use mdspan + size_t stride; enum : bool { is_extent_constructible = true }; - LayoutRight(LayoutRight const&) = default; - LayoutRight(LayoutRight&&) = default; + LayoutRight(LayoutRight const&) = default; + LayoutRight(LayoutRight&&) = default; LayoutRight& operator=(LayoutRight const&) = default; - LayoutRight& operator=(LayoutRight&&) = default; + LayoutRight& operator=(LayoutRight&&) = default; KOKKOS_INLINE_FUNCTION explicit constexpr LayoutRight(size_t N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -118,7 +127,8 @@ struct LayoutRight { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{KOKKOS_IMPL_CTOR_DEFAULT_ARG} {} friend bool operator==(const LayoutRight& left, const LayoutRight& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) @@ -144,10 +154,10 @@ struct LayoutStride { enum : bool { is_extent_constructible = false }; - LayoutStride(LayoutStride const&) = default; - LayoutStride(LayoutStride&&) = default; + LayoutStride(LayoutStride const&) = default; + LayoutStride(LayoutStride&&) = default; LayoutStride& operator=(LayoutStride const&) = default; - LayoutStride& operator=(LayoutStride&&) = default; + LayoutStride& operator=(LayoutStride&&) = default; /** \brief Compute strides from ordered dimensions. * @@ -191,8 +201,8 @@ struct LayoutStride { size_t N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S5 = 0, size_t N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S6 = 0, size_t N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, size_t S7 = 0) - : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, stride{S0, S1, S2, S3, - S4, S5, S6, S7} {} + : dimension{N0, N1, N2, N3, N4, N5, N6, N7}, + stride{S0, S1, S2, S3, S4, S5, S6, S7} {} friend bool operator==(const LayoutStride& left, const LayoutStride& right) { for (unsigned int rank = 0; rank < ARRAY_LAYOUT_MAX_RANK; ++rank) diff --git a/lib/kokkos/core/src/Kokkos_Macros.hpp b/lib/kokkos/core/src/Kokkos_Macros.hpp index 0a0acd303f..97b78a3c64 100644 --- a/lib/kokkos/core/src/Kokkos_Macros.hpp +++ b/lib/kokkos/core/src/Kokkos_Macros.hpp @@ -27,7 +27,7 @@ * KOKKOS_ENABLE_OPENMPTARGET Kokkos::Experimental::OpenMPTarget * execution space * KOKKOS_ENABLE_HIP Kokkos::HIP execution space - * KOKKOS_ENABLE_SYCL Kokkos::Experimental::SYCL execution space + * KOKKOS_ENABLE_SYCL Kokkos::SYCL execution space * KOKKOS_ENABLE_HWLOC HWLOC library is available. * KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK Insert array bounds checks, is expensive! * KOKKOS_ENABLE_CUDA_UVM Use CUDA UVM for Cuda memory space. @@ -132,7 +132,7 @@ #define KOKKOS_CLASS_LAMBDA [ =, *this ] #endif -//#if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. +// #if !defined( __CUDA_ARCH__ ) // Not compiling Cuda code to 'ptx'. // Intel compiler for host code. @@ -252,10 +252,10 @@ // CLANG compiler macros #if defined(KOKKOS_COMPILER_CLANG) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -273,10 +273,10 @@ // GNU Compiler macros #if defined(KOKKOS_COMPILER_GNU) -//#define KOKKOS_ENABLE_PRAGMA_UNROLL 1 -//#define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 -//#define KOKKOS_ENABLE_PRAGMA_VECTOR 1 +// #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 +// #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #if !defined(KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION) #define KOKKOS_IMPL_HOST_FORCEINLINE_FUNCTION \ @@ -298,7 +298,7 @@ #if defined(KOKKOS_COMPILER_NVHPC) #define KOKKOS_ENABLE_PRAGMA_UNROLL 1 #define KOKKOS_ENABLE_PRAGMA_IVDEP 1 -//#define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 +// #define KOKKOS_ENABLE_PRAGMA_LOOPCOUNT 1 #define KOKKOS_ENABLE_PRAGMA_VECTOR 1 #endif @@ -357,6 +357,21 @@ #define KOKKOS_IMPL_DEVICE_FUNCTION #endif +// FIXME_OPENACC FIXME_OPENMPTARGET +// Move to setup files once there is more content +// clang-format off +#if defined(KOKKOS_ENABLE_OPENACC) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenACC backend" +#endif +#if defined(KOKKOS_ENABLE_OPENMPTARGET) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION @"KOKKOS_RELOCATABLE_FUNCTION is not supported for the OpenMPTarget backend" +#endif +// clang-format on + +#if !defined(KOKKOS_IMPL_RELOCATABLE_FUNCTION) +#define KOKKOS_IMPL_RELOCATABLE_FUNCTION +#endif + //---------------------------------------------------------------------------- // Define final version of functions. This is so that clang tidy can find these // macros more easily @@ -369,10 +384,14 @@ #define KOKKOS_FORCEINLINE_FUNCTION \ KOKKOS_IMPL_FORCEINLINE_FUNCTION \ __attribute__((annotate("KOKKOS_FORCEINLINE_FUNCTION"))) +#define KOKKOS_RELOCATABLE_FUNCTION \ + KOKKOS_IMPL_RELOCATABLE_FUNCTION \ + __attribute__((annotate("KOKKOS_RELOCATABLE_FUNCTION"))) #else #define KOKKOS_FUNCTION KOKKOS_IMPL_FUNCTION #define KOKKOS_INLINE_FUNCTION KOKKOS_IMPL_INLINE_FUNCTION #define KOKKOS_FORCEINLINE_FUNCTION KOKKOS_IMPL_FORCEINLINE_FUNCTION +#define KOKKOS_RELOCATABLE_FUNCTION KOKKOS_IMPL_RELOCATABLE_FUNCTION #endif //---------------------------------------------------------------------------- @@ -537,14 +556,17 @@ static constexpr bool kokkos_omp_on_host() { return false; } // If compiling with CUDA, we must use relocatable device code to enable the // task policy. +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #if defined(KOKKOS_ENABLE_CUDA) #if defined(KOKKOS_ENABLE_CUDA_RELOCATABLE_DEVICE_CODE) #define KOKKOS_ENABLE_TASKDAG #endif // FIXME_SYCL Tasks not implemented -#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) +#elif !defined(KOKKOS_ENABLE_HIP) && !defined(KOKKOS_ENABLE_SYCL) && \ + !defined(KOKKOS_ENABLE_OPENMPTARGET) #define KOKKOS_ENABLE_TASKDAG #endif +#endif #if defined(KOKKOS_ENABLE_CUDA) && defined(KOKKOS_ENABLE_DEPRECATED_CODE_4) #define KOKKOS_ENABLE_CUDA_LDG_INTRINSIC @@ -582,9 +604,11 @@ static constexpr bool kokkos_omp_on_host() { return false; } // clang-format off #if defined(__NVCOMPILER) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ - _Pragma("diag_suppress 1216") + _Pragma("diag_suppress 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() \ - _Pragma("diag_default 1216") + _Pragma("diag_default 1216") \ + _Pragma("diag_suppress deprecated_entity_with_custom_message") #elif defined(__EDG__) #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() \ _Pragma("warning push") \ @@ -607,6 +631,18 @@ static constexpr bool kokkos_omp_on_host() { return false; } #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #define KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif + +#if defined(__NVCOMPILER) +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() \ + _Pragma("diag_suppress code_is_unreachable") \ + _Pragma("diag_suppress initialization_not_reachable") +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() \ + _Pragma("diag_default code_is_unreachable") \ + _Pragma("diag_default initialization_not_reachable") +#else +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_PUSH() +#define KOKKOS_IMPL_DISABLE_UNREACHABLE_WARNINGS_POP() +#endif // clang-format on #define KOKKOS_ATTRIBUTE_NODISCARD [[nodiscard]] diff --git a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp index ce8c9e152f..f7e9e2a78c 100644 --- a/lib/kokkos/core/src/Kokkos_MemoryPool.hpp +++ b/lib/kokkos/core/src/Kokkos_MemoryPool.hpp @@ -196,9 +196,10 @@ class MemoryPool { stats.consumed_superblocks++; stats.consumed_blocks += block_used; - stats.consumed_bytes += block_used * block_size; + stats.consumed_bytes += static_cast(block_used) * block_size; stats.reserved_blocks += block_count - block_used; - stats.reserved_bytes += (block_count - block_used) * block_size; + stats.reserved_bytes += + static_cast(block_count - block_used) * block_size; } } @@ -234,9 +235,9 @@ class MemoryPool { //-------------------------------------------------------------------------- - KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; - KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(MemoryPool &&) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool(const MemoryPool &) = default; + KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(MemoryPool &&) = default; KOKKOS_DEFAULTED_FUNCTION MemoryPool &operator=(const MemoryPool &) = default; KOKKOS_INLINE_FUNCTION MemoryPool() diff --git a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp index 118bf52c05..1304d3ba92 100644 --- a/lib/kokkos/core/src/Kokkos_NumericTraits.hpp +++ b/lib/kokkos/core/src/Kokkos_NumericTraits.hpp @@ -114,7 +114,7 @@ template <> struct signaling_NaN_helper { static constexpr long dou #endif template struct digits_helper {}; template <> struct digits_helper { static constexpr int value = 1; }; -template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed::value; }; +template <> struct digits_helper { static constexpr int value = CHAR_BIT - std::is_signed_v; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT - 1; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT; }; template <> struct digits_helper { static constexpr int value = CHAR_BIT*sizeof(short)-1; }; diff --git a/lib/kokkos/core/src/Kokkos_Pair.hpp b/lib/kokkos/core/src/Kokkos_Pair.hpp index e569fefc14..c44d1f2310 100644 --- a/lib/kokkos/core/src/Kokkos_Pair.hpp +++ b/lib/kokkos/core/src/Kokkos_Pair.hpp @@ -449,7 +449,8 @@ struct KOKKOS_DEPRECATED pair { // Specialization of relational operators for Kokkos::pair. // -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() #endif template @@ -487,7 +488,8 @@ KOKKOS_DEPRECATED KOKKOS_FORCEINLINE_FUNCTION constexpr bool operator>=( const pair& lhs, const pair& rhs) { return !(lhs < rhs); } -#if defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) +#if defined(KOKKOS_ENABLE_DEPRECATION_WARNINGS) && \ + defined(KOKKOS_COMPILER_GNU) && (KOKKOS_COMPILER_GNU < 1110) KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() #endif #endif diff --git a/lib/kokkos/core/src/Kokkos_Parallel.hpp b/lib/kokkos/core/src/Kokkos_Parallel.hpp index 122239df79..24349e95ae 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel.hpp @@ -72,19 +72,19 @@ struct FunctorPolicyExecutionSpace { static_assert( !is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with an execution space " "are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A policy with an execution space and a functor with a device " "type are given but the execution space types do not match!"); static_assert(!is_detected::value || !is_detected::value || - std::is_same::value, + std::is_same_v, "A functor with both an execution space and device type is " "given but their execution space types do not match!"); @@ -134,8 +134,10 @@ inline void parallel_for(const std::string& str, const ExecPolicy& policy, const FunctorType& functor) { uint64_t kpID = 0; - ExecPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_for(inner_policy, functor, str, kpID); + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_for(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< @@ -348,9 +350,11 @@ template ::value>> inline void parallel_scan(const std::string& str, const ExecutionPolicy& policy, const FunctorType& functor) { - uint64_t kpID = 0; - ExecutionPolicy inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_scan(inner_policy, functor, str, kpID); + uint64_t kpID = 0; + /** Request a tuned policy from the tools subsystem */ + const auto& response = + Kokkos::Tools::Impl::begin_parallel_scan(policy, functor, str, kpID); + const auto& inner_policy = response.policy; auto closure = Kokkos::Impl::construct_with_shared_allocation_tracking_disabled< diff --git a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp index 53913266f1..3b89d184f2 100644 --- a/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp +++ b/lib/kokkos/core/src/Kokkos_Parallel_Reduce.hpp @@ -73,7 +73,7 @@ struct Sum { template KOKKOS_DEDUCTION_GUIDE Sum(View const&) - ->Sum::memory_space>; + -> Sum::memory_space>; template struct Prod { @@ -118,7 +118,7 @@ struct Prod { template KOKKOS_DEDUCTION_GUIDE Prod(View const&) - ->Prod::memory_space>; + -> Prod::memory_space>; template struct Min { @@ -165,7 +165,7 @@ struct Min { template KOKKOS_DEDUCTION_GUIDE Min(View const&) - ->Min::memory_space>; + -> Min::memory_space>; template struct Max { @@ -213,7 +213,7 @@ struct Max { template KOKKOS_DEDUCTION_GUIDE Max(View const&) - ->Max::memory_space>; + -> Max::memory_space>; template struct LAnd { @@ -259,7 +259,7 @@ struct LAnd { template KOKKOS_DEDUCTION_GUIDE LAnd(View const&) - ->LAnd::memory_space>; + -> LAnd::memory_space>; template struct LOr { @@ -306,7 +306,7 @@ struct LOr { template KOKKOS_DEDUCTION_GUIDE LOr(View const&) - ->LOr::memory_space>; + -> LOr::memory_space>; template struct BAnd { @@ -353,7 +353,7 @@ struct BAnd { template KOKKOS_DEDUCTION_GUIDE BAnd(View const&) - ->BAnd::memory_space>; + -> BAnd::memory_space>; template struct BOr { @@ -400,7 +400,7 @@ struct BOr { template KOKKOS_DEDUCTION_GUIDE BOr(View const&) - ->BOr::memory_space>; + -> BOr::memory_space>; template struct ValLocScalar { @@ -438,7 +438,12 @@ struct MinLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -458,11 +463,10 @@ struct MinLoc { }; template -KOKKOS_DEDUCTION_GUIDE MinLoc( - View, Properties...> const&) - ->MinLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MinLoc(View, Properties...> const&) -> MinLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MaxLoc { @@ -494,7 +498,12 @@ struct MaxLoc { // Required KOKKOS_INLINE_FUNCTION void join(value_type& dest, const value_type& src) const { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -514,11 +523,10 @@ struct MaxLoc { }; template -KOKKOS_DEDUCTION_GUIDE MaxLoc( - View, Properties...> const&) - ->MaxLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +MaxLoc(View, Properties...> const&) -> MaxLoc< + Scalar, Index, + typename View, Properties...>::memory_space>; template struct MinMaxScalar { @@ -580,8 +588,8 @@ struct MinMax { template KOKKOS_DEDUCTION_GUIDE MinMax(View, Properties...> const&) - ->MinMax, Properties...>::memory_space>; + -> MinMax, Properties...>::memory_space>; template struct MinMaxLocScalar { @@ -622,10 +630,16 @@ struct MinMaxLoc { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { + dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -650,9 +664,9 @@ struct MinMaxLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxLoc( View, Properties...> const&) - ->MinMaxLoc, - Properties...>::memory_space>; + -> MinMaxLoc, + Properties...>::memory_space>; // -------------------------------------------------- // reducers added to support std algorithms @@ -718,9 +732,9 @@ struct MaxFirstLoc { template KOKKOS_DEDUCTION_GUIDE MaxFirstLoc( View, Properties...> const&) - ->MaxFirstLoc, - Properties...>::memory_space>; + -> MaxFirstLoc, + Properties...>::memory_space>; // // MaxFirstLocCustomComparator @@ -788,9 +802,9 @@ template KOKKOS_DEDUCTION_GUIDE MaxFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MaxFirstLocCustomComparator, - Properties...>::memory_space>; + -> MaxFirstLocCustomComparator, + Properties...>::memory_space>; // // MinFirstLoc @@ -852,9 +866,9 @@ struct MinFirstLoc { template KOKKOS_DEDUCTION_GUIDE MinFirstLoc( View, Properties...> const&) - ->MinFirstLoc, - Properties...>::memory_space>; + -> MinFirstLoc, + Properties...>::memory_space>; // // MinFirstLocCustomComparator @@ -922,9 +936,9 @@ template KOKKOS_DEDUCTION_GUIDE MinFirstLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinFirstLocCustomComparator, - Properties...>::memory_space>; + -> MinFirstLocCustomComparator, + Properties...>::memory_space>; // // MinMaxFirstLastLoc @@ -997,9 +1011,9 @@ struct MinMaxFirstLastLoc { template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLoc( View, Properties...> const&) - ->MinMaxFirstLastLoc, - Properties...>::memory_space>; + -> MinMaxFirstLastLoc, + Properties...>::memory_space>; // // MinMaxFirstLastLocCustomComparator @@ -1077,7 +1091,7 @@ template KOKKOS_DEDUCTION_GUIDE MinMaxFirstLastLocCustomComparator( View, Properties...> const&, ComparatorType) - ->MinMaxFirstLastLocCustomComparator< + -> MinMaxFirstLastLocCustomComparator< Scalar, Index, ComparatorType, typename View, Properties...>::memory_space>; @@ -1139,10 +1153,9 @@ struct FirstLoc { }; template -KOKKOS_DEDUCTION_GUIDE FirstLoc( - View, Properties...> const&) - ->FirstLoc, - Properties...>::memory_space>; +KOKKOS_DEDUCTION_GUIDE +FirstLoc(View, Properties...> const&) -> FirstLoc< + Index, typename View, Properties...>::memory_space>; // // LastLoc @@ -1202,8 +1215,8 @@ struct LastLoc { template KOKKOS_DEDUCTION_GUIDE LastLoc(View, Properties...> const&) - ->LastLoc, Properties...>::memory_space>; + -> LastLoc, + Properties...>::memory_space>; template struct StdIsPartScalar { @@ -1270,8 +1283,8 @@ struct StdIsPartitioned { template KOKKOS_DEDUCTION_GUIDE StdIsPartitioned( View, Properties...> const&) - ->StdIsPartitioned, - Properties...>::memory_space>; + -> StdIsPartitioned, + Properties...>::memory_space>; template struct StdPartPointScalar { @@ -1333,8 +1346,8 @@ struct StdPartitionPoint { template KOKKOS_DEDUCTION_GUIDE StdPartitionPoint( View, Properties...> const&) - ->StdPartitionPoint, - Properties...>::memory_space>; + -> StdPartitionPoint, + Properties...>::memory_space>; } // namespace Kokkos namespace Kokkos { @@ -1404,9 +1417,9 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< std::enable_if_t::value && - (!std::is_array::value && - !std::is_pointer::value) && - !Kokkos::is_reducer::value>, + (!std::is_array_v && + !std::is_pointer_v< + ReturnType>)&&!Kokkos::is_reducer::value>, ReturnType, FunctorType> { using return_type = Kokkos::View; @@ -1422,8 +1435,8 @@ struct ParallelReduceReturnValue< template struct ParallelReduceReturnValue< - std::enable_if_t<(std::is_array::value || - std::is_pointer::value)>, + std::enable_if_t<(std::is_array_v || + std::is_pointer_v)>, ReturnType, FunctorType> { using return_type = Kokkos::View, Kokkos::HostSpace, Kokkos::MemoryUnmanaged>; @@ -1434,7 +1447,7 @@ struct ParallelReduceReturnValue< static return_type return_value(ReturnType& return_val, const FunctorType& functor) { - if (std::is_array::value) + if (std::is_array_v) return return_type(return_val); else return return_type(return_val, functor.value_count); @@ -1467,8 +1480,7 @@ struct ParallelReducePolicyType< template struct ParallelReducePolicyType< - std::enable_if_t::value>, PolicyType, - FunctorType> { + std::enable_if_t>, PolicyType, FunctorType> { using execution_space = typename Impl::FunctorPolicyExecutionSpace::execution_space; @@ -1501,27 +1513,28 @@ struct ParallelReduceAdaptor { using PassedReducerType = typename return_value_adapter::reducer_type; uint64_t kpID = 0; - PolicyType inner_policy = policy; - Kokkos::Tools::Impl::begin_parallel_reduce( - inner_policy, functor, label, kpID); - using ReducerSelector = - Kokkos::Impl::if_c::value, + Kokkos::Impl::if_c, FunctorType, PassedReducerType>; using Analysis = FunctorAnalysis; - using CombinedFunctorReducerType = CombinedFunctorReducer; + + CombinedFunctorReducerType functor_reducer( + functor, typename Analysis::Reducer( + ReducerSelector::select(functor, return_value))); + const auto& response = Kokkos::Tools::Impl::begin_parallel_reduce< + typename return_value_adapter::reducer_type>(policy, functor_reducer, + label, kpID); + const auto& inner_policy = response.policy; + auto closure = construct_with_shared_allocation_tracking_disabled< Impl::ParallelReduce::execution_space>>( - CombinedFunctorReducerType( - functor, typename Analysis::Reducer( - ReducerSelector::select(functor, return_value))), - inner_policy, + functor_reducer, inner_policy, return_value_adapter::return_value(return_value, functor)); closure.execute(); @@ -1536,7 +1549,7 @@ struct ParallelReduceAdaptor { template static inline std::enable_if_t::value)> + std::is_pointer_v)> execute(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { execute_impl(label, policy, functor, return_value); @@ -1568,7 +1581,7 @@ struct ReducerHasTestReferenceFunction { static std::false_type test_func(...); enum { - value = std::is_same(nullptr))>::value + value = std::is_same_v(nullptr))> }; }; @@ -1611,7 +1624,7 @@ struct ParallelReduceFence { template static void fence(const ExecutionSpace& ex, const std::string& name, ArgsDeduced&&... args) { - if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced &&) args...)) { + if (Impl::parallel_reduce_needs_fence(ex, (ArgsDeduced&&)args...)) { ex.fence(name); } } @@ -1663,11 +1676,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1684,11 +1697,11 @@ template inline std::enable_if_t::value && !(Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1704,11 +1717,11 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1728,11 +1741,11 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, ReturnType& return_value) { static_assert( - !std::is_const::value, + !std::is_const_v, "A const reduction result type is only allowed for a View, pointer or " "reducer return type!"); @@ -1754,7 +1767,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const std::string& label, const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1771,7 +1784,7 @@ template inline std::enable_if_t::value && (Kokkos::is_view::value || Kokkos::is_reducer::value || - std::is_pointer::value)> + std::is_pointer_v)> parallel_reduce(const PolicyType& policy, const FunctorType& functor, const ReturnType& return_value) { ReturnType return_value_impl = return_value; @@ -1787,7 +1800,7 @@ parallel_reduce(const PolicyType& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = @@ -1806,7 +1819,7 @@ parallel_reduce(const size_t& policy, const FunctorType& functor, template inline std::enable_if_t::value || Kokkos::is_reducer::value || - std::is_pointer::value> + std::is_pointer_v> parallel_reduce(const std::string& label, const size_t& policy, const FunctorType& functor, const ReturnType& return_value) { using policy_type = diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp index e7a9ba0c7e..1759c2b4a1 100644 --- a/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp +++ b/lib/kokkos/core/src/Kokkos_Profiling_ProfileSection.hpp @@ -32,7 +32,7 @@ class [[nodiscard]] ProfilingSection { uint32_t sectionID; public: - ProfilingSection(ProfilingSection const&) = delete; + ProfilingSection(ProfilingSection const&) = delete; ProfilingSection& operator=(ProfilingSection const&) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp b/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp index f45dfa324e..a4168b9401 100644 --- a/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp +++ b/lib/kokkos/core/src/Kokkos_Profiling_ScopedRegion.hpp @@ -30,7 +30,7 @@ namespace Kokkos::Profiling { class [[nodiscard]] ScopedRegion { public: - ScopedRegion(ScopedRegion const &) = delete; + ScopedRegion(ScopedRegion const &) = delete; ScopedRegion &operator=(ScopedRegion const &) = delete; #if defined(__has_cpp_attribute) && __has_cpp_attribute(nodiscard) >= 201907 diff --git a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp index a925e32a33..f00e25fdb6 100644 --- a/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp +++ b/lib/kokkos/core/src/Kokkos_ScratchSpace.hpp @@ -110,7 +110,7 @@ class ScratchMemorySpace { // Note: for team scratch m_offset is 0, since every // thread will get back the same shared pointer void* tmp = m_iter + m_offset * size; - uintptr_t increment = size * m_multiplier; + uintptr_t increment = static_cast(size) * m_multiplier; // Cast to uintptr_t to avoid problems with pointer arithmetic using SYCL const auto end_iter = diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp index 869a5f8ec2..3edecb4502 100644 --- a/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler.hpp @@ -14,11 +14,17 @@ // //@HEADER -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE #include + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE static_assert(false, "Including non-public Kokkos header files is not allowed."); #endif + +#ifndef KOKKOS_ENABLE_DEPRECATED_CODE_4 +#error "The tasking framework is deprecated" +#endif + #ifndef KOKKOS_TASKSCHEDULER_HPP #define KOKKOS_TASKSCHEDULER_HPP @@ -44,6 +50,11 @@ static_assert(false, //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -54,7 +65,7 @@ class TaskExec; } // end namespace Impl template -class BasicTaskScheduler : public Impl::TaskSchedulerBase { +class KOKKOS_DEPRECATED BasicTaskScheduler : public Impl::TaskSchedulerBase { public: using scheduler_type = BasicTaskScheduler; using execution_space = ExecSpace; @@ -494,8 +505,8 @@ namespace Kokkos { // Construct a TaskTeam execution policy template -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskTeam, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -503,7 +514,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskTeam( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -512,18 +524,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskTeam, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskTeam(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -531,8 +543,8 @@ Impl::TaskPolicyWithScheduler -Impl::TaskPolicyWithPredecessor> +KOKKOS_DEPRECATED Impl::TaskPolicyWithPredecessor< + Impl::TaskType::TaskSingle, Kokkos::BasicFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Kokkos::BasicFuture arg_future, TaskPriority arg_priority = TaskPriority::Regular) { @@ -540,7 +552,8 @@ Impl::TaskPolicyWithPredecessor -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler KOKKOS_INLINE_FUNCTION TaskSingle( Scheduler arg_scheduler, std::enable_if_t::value, TaskPriority> @@ -549,18 +562,18 @@ Impl::TaskPolicyWithScheduler } template -Impl::TaskPolicyWithScheduler +KOKKOS_DEPRECATED Impl::TaskPolicyWithScheduler< + Kokkos::Impl::TaskType::TaskSingle, Scheduler, PredecessorFuture> KOKKOS_INLINE_FUNCTION TaskSingle(Scheduler arg_scheduler, PredecessorFuture arg_future, std::enable_if_t::value && Kokkos::is_future::value, TaskPriority> arg_priority = TaskPriority::Regular) { - static_assert(std::is_same::value, - "Can't create a task policy from a scheduler and a future from " - "a different scheduler"); + static_assert( + std::is_same_v, + "Can't create a task policy from a scheduler and a future from " + "a different scheduler"); return {std::move(arg_scheduler), std::move(arg_future), arg_priority}; } @@ -575,7 +588,8 @@ Impl::TaskPolicyWithScheduler -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> host_spawn(Impl::TaskPolicyWithScheduler arg_policy, FunctorType&& arg_functor) { @@ -606,7 +620,8 @@ host_spawn(Impl::TaskPolicyWithScheduler */ template -typename Scheduler::template future_type_for_functor> +KOKKOS_DEPRECATED typename Scheduler::template future_type_for_functor< + std::decay_t> KOKKOS_INLINE_FUNCTION task_spawn(Impl::TaskPolicyWithScheduler arg_policy, @@ -633,7 +648,7 @@ typename Scheduler::template future_type_for_functor> * 2) High, Normal, or Low priority */ template -void KOKKOS_INLINE_FUNCTION +KOKKOS_DEPRECATED void KOKKOS_INLINE_FUNCTION respawn(FunctorType* arg_self, T const& arg, TaskPriority const& arg_priority = TaskPriority::Regular) { static_assert(Kokkos::is_future::value || Kokkos::is_scheduler::value, @@ -656,7 +671,8 @@ respawn(FunctorType* arg_self, T const& arg, // Wait for all runnable tasks to complete template -inline void wait(BasicTaskScheduler const& scheduler) { +KOKKOS_DEPRECATED inline void wait( + BasicTaskScheduler const& scheduler) { using scheduler_type = BasicTaskScheduler; scheduler_type::specialization::execute(scheduler); // scheduler.m_queue->execute(); @@ -664,6 +680,10 @@ inline void wait(BasicTaskScheduler const& scheduler) { } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp b/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp index 203fb16eaf..83e1c06db9 100644 --- a/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp +++ b/lib/kokkos/core/src/Kokkos_TaskScheduler_fwd.hpp @@ -31,31 +31,40 @@ static_assert(false, #include //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { // Forward declarations used in Impl::TaskQueue template -class BasicFuture; +class KOKKOS_DEPRECATED BasicFuture; template -class SimpleTaskScheduler; +class KOKKOS_DEPRECATED SimpleTaskScheduler; template -class BasicTaskScheduler; +class KOKKOS_DEPRECATED BasicTaskScheduler; template -struct is_scheduler : public std::false_type {}; +struct KOKKOS_DEPRECATED is_scheduler : public std::false_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; template -struct is_scheduler> : public std::true_type { -}; +struct KOKKOS_DEPRECATED is_scheduler> + : public std::true_type {}; -enum class TaskPriority : int { High = 0, Regular = 1, Low = 2 }; +enum class KOKKOS_DEPRECATED TaskPriority : int { + High = 0, + Regular = 1, + Low = 2 +}; } // namespace Kokkos @@ -141,28 +150,28 @@ using default_tasking_memory_space_for_execution_space_t = namespace Kokkos { template -using DeprecatedTaskScheduler = BasicTaskScheduler< +using DeprecatedTaskScheduler KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using DeprecatedTaskSchedulerMultiple = BasicTaskScheduler< +using DeprecatedTaskSchedulerMultiple KOKKOS_DEPRECATED = BasicTaskScheduler< Space, Impl::TaskQueueMultiple< Space, Impl::default_tasking_memory_space_for_execution_space_t>>; template -using TaskScheduler = SimpleTaskScheduler< +using TaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::SingleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, Impl::TaskQueueTraitsLockBased>>; template -using TaskSchedulerMultiple = SimpleTaskScheduler< +using TaskSchedulerMultiple KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -172,7 +181,7 @@ using TaskSchedulerMultiple = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -using ChaseLevTaskScheduler = SimpleTaskScheduler< +using ChaseLevTaskScheduler KOKKOS_DEPRECATED = SimpleTaskScheduler< Space, Impl::MultipleTaskQueue< Space, Impl::default_tasking_memory_space_for_execution_space_t, @@ -182,7 +191,7 @@ using ChaseLevTaskScheduler = SimpleTaskScheduler< Impl::default_tasking_memory_space_for_execution_space_t>>>>; template -void wait(BasicTaskScheduler const&); +KOKKOS_DEPRECATED void wait(BasicTaskScheduler const&); namespace Impl { @@ -204,6 +213,10 @@ struct TaskPolicyData; } // namespace Kokkos +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_POP() +#endif + //---------------------------------------------------------------------------- #endif /* #if defined( KOKKOS_ENABLE_TASKDAG ) */ diff --git a/lib/kokkos/core/src/Kokkos_Timer.hpp b/lib/kokkos/core/src/Kokkos_Timer.hpp index a210b6ff18..ab31484d76 100644 --- a/lib/kokkos/core/src/Kokkos_Timer.hpp +++ b/lib/kokkos/core/src/Kokkos_Timer.hpp @@ -48,7 +48,7 @@ class Timer { inline Timer() { reset(); } - Timer(const Timer&) = delete; + Timer(const Timer&) = delete; Timer& operator=(const Timer&) = delete; inline double seconds() const { diff --git a/lib/kokkos/core/src/Kokkos_Tuners.hpp b/lib/kokkos/core/src/Kokkos_Tuners.hpp index f5ffc66af5..fcb061b378 100644 --- a/lib/kokkos/core/src/Kokkos_Tuners.hpp +++ b/lib/kokkos/core/src/Kokkos_Tuners.hpp @@ -52,6 +52,8 @@ VariableValue make_variable_value(size_t, int64_t); VariableValue make_variable_value(size_t, double); SetOrRange make_candidate_range(double lower, double upper, double step, bool openLower, bool openUpper); +SetOrRange make_candidate_range(int64_t lower, int64_t upper, int64_t step, + bool openLower, bool openUpper); size_t get_new_context_id(); void begin_context(size_t context_id); void end_context(size_t context_id); @@ -412,18 +414,19 @@ class TeamSizeTuner : public ExtendableTunerMixin { TunerType tuner; public: - TeamSizeTuner() = default; + TeamSizeTuner() = default; TeamSizeTuner& operator=(const TeamSizeTuner& other) = default; TeamSizeTuner(const TeamSizeTuner& other) = default; - TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; - TeamSizeTuner(TeamSizeTuner&& other) = default; + TeamSizeTuner& operator=(TeamSizeTuner&& other) = default; + TeamSizeTuner(TeamSizeTuner&& other) = default; template TeamSizeTuner(const std::string& name, - Kokkos::TeamPolicy& policy, + const Kokkos::TeamPolicy& policy_in, const Functor& functor, const TagType& tag, ViableConfigurationCalculator calc) { - using PolicyType = Kokkos::TeamPolicy; + using PolicyType = Kokkos::TeamPolicy; + PolicyType policy(policy_in); auto initial_vector_length = policy.impl_vector_length(); if (initial_vector_length < 1) { policy.impl_set_vector_length(1); @@ -505,7 +508,8 @@ class TeamSizeTuner : public ExtendableTunerMixin { } template - void tune(Kokkos::TeamPolicy& policy) { + auto tune(const Kokkos::TeamPolicy& policy_in) { + Kokkos::TeamPolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); auto team_size = std::get<1>(configuration); @@ -515,6 +519,111 @@ class TeamSizeTuner : public ExtendableTunerMixin { policy.impl_set_vector_length(vector_length); } } + return policy; + } + void end() { + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + tuner.end(); + } + } + + TunerType get_tuner() const { return tuner; } +}; +namespace Impl { +template +struct tuning_type_for; + +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_double; + static double get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.double_value; + } +}; +template <> +struct tuning_type_for { + static constexpr Kokkos::Tools::Experimental::ValueType value = + Kokkos::Tools::Experimental::ValueType::kokkos_value_int64; + static int64_t get( + const Kokkos::Tools::Experimental::VariableValue& value_struct) { + return value_struct.value.int_value; + } +}; +} // namespace Impl +template +class SingleDimensionalRangeTuner { + size_t id; + size_t context; + using tuning_util = Impl::tuning_type_for; + + Bound default_value; + + public: + SingleDimensionalRangeTuner() = default; + SingleDimensionalRangeTuner( + const std::string& name, + Kokkos::Tools::Experimental::StatisticalCategory category, + Bound default_val, Bound lower, Bound upper, Bound step = (Bound)0) { + default_value = default_val; + Kokkos::Tools::Experimental::VariableInfo info; + info.category = category; + info.candidates = make_candidate_range( + static_cast(lower), static_cast(upper), + static_cast(step), false, false); + info.valueQuantity = + Kokkos::Tools::Experimental::CandidateValueType::kokkos_value_range; + info.type = tuning_util::value; + id = Kokkos::Tools::Experimental::declare_output_type(name, info); + } + + Bound begin() { + context = Kokkos::Tools::Experimental::get_new_context_id(); + Kokkos::Tools::Experimental::begin_context(context); + auto tuned_value = + Kokkos::Tools::Experimental::make_variable_value(id, default_value); + Kokkos::Tools::Experimental::request_output_values(context, 1, + &tuned_value); + return tuning_util::get(tuned_value); + } + + void end() { Kokkos::Tools::Experimental::end_context(context); } + + template + void with_tuned_value(Functor& func) { + func(begin()); + end(); + } +}; + +class RangePolicyOccupancyTuner { + private: + using TunerType = SingleDimensionalRangeTuner; + TunerType tuner; + + public: + RangePolicyOccupancyTuner() = default; + template + RangePolicyOccupancyTuner(const std::string& name, + const Kokkos::RangePolicy&, + const Functor&, const TagType&, + ViableConfigurationCalculator) + : tuner(TunerType(name, + Kokkos::Tools::Experimental::StatisticalCategory:: + kokkos_value_ratio, + 100, 5, 100, 5)) {} + + template + auto tune(const Kokkos::RangePolicy& policy_in) { + Kokkos::RangePolicy policy(policy_in); + if (Kokkos::Tools::Experimental::have_tuning_tool()) { + auto occupancy = tuner.begin(); + policy.impl_set_desired_occupancy( + Kokkos::Experimental::DesiredOccupancy{static_cast(occupancy)}); + } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { @@ -578,11 +687,13 @@ struct MDRangeTuner : public ExtendableTunerMixin> { policy.impl_change_tile_size({std::get(tuple)...}); } template - void tune(Kokkos::MDRangePolicy& policy) { + auto tune(const Kokkos::MDRangePolicy& policy_in) { + Kokkos::MDRangePolicy policy(policy_in); if (Kokkos::Tools::Experimental::have_tuning_tool()) { auto configuration = tuner.begin(); set_policy_tile(policy, configuration, std::make_index_sequence{}); } + return policy; } void end() { if (Kokkos::Tools::Experimental::have_tuning_tool()) { diff --git a/lib/kokkos/core/src/Kokkos_TypeInfo.hpp b/lib/kokkos/core/src/Kokkos_TypeInfo.hpp new file mode 100644 index 0000000000..e5710da2e3 --- /dev/null +++ b/lib/kokkos/core/src/Kokkos_TypeInfo.hpp @@ -0,0 +1,103 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_TYPE_INFO_HPP +#define KOKKOS_TYPE_INFO_HPP + +#include +#include +#include + +#include + +// Intel C++ Compiler Classic version 2021.2.0 works but 2021.1.2 doesn't +// Both have __INTEL_COMPILER defined to 2021 so using +// __INTEL_COMPILER_BUILD_DATE to discriminate. +// Experimenting on the compiler explorer gave +// icc version | __INTEL_COMPILER | __INTEL_COMPILER_BUILD_DATE +// 2021.1.2 | 2021 | 20201208 +// 2021.2.0 | 2021 | 20210228 +// NVCC versions less than 11.3.0 segfault when that header is included +// NVCC+MSVC doesn't work at all - it simply reports "T" inside type_name +#if (!defined(KOKKOS_COMPILER_INTEL) || \ + (__INTEL_COMPILER_BUILD_DATE >= 20210228)) && \ + (!defined(KOKKOS_COMPILER_NVCC) || (KOKKOS_COMPILER_NVCC >= 1130)) && \ + (!(defined(KOKKOS_COMPILER_NVCC) && defined(KOKKOS_COMPILER_MSVC))) + +#define KOKKOS_ENABLE_IMPL_TYPEINFO + +namespace Kokkos::Impl { + +template +constexpr std::array to_array(std::string_view src) { + std::array dst{}; + for (size_t i = 0; i < N; ++i) { + dst[i] = src[i]; + } + return dst; +} + +template +constexpr auto type_name() { +#if defined(__clang__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(__GNUC__) + constexpr std::string_view func = __PRETTY_FUNCTION__; + constexpr std::string_view prefix{"[with T = "}; + constexpr std::string_view suffix{"]"}; +#elif defined(_MSC_VER) + constexpr std::string_view func = __FUNCSIG__; + constexpr std::string_view prefix{"type_name<"}; + constexpr std::string_view suffix{">(void)"}; +#else +#error bug +#endif + constexpr auto beg = func.find(prefix) + prefix.size(); + constexpr auto end = func.rfind(suffix); + static_assert(beg != std::string_view::npos); + static_assert(end != std::string_view::npos); + return to_array(func.substr(beg, end)); +} + +template +class TypeInfo { + static constexpr auto value_ = type_name(); + + public: + static constexpr std::string_view name() noexcept { + return {value_.data(), value_.size()}; + } +}; + +} // namespace Kokkos::Impl + +#else // out of luck, using Intel C++ Compiler Classic + +namespace Kokkos::Impl { + +template +class TypeInfo { + public: + static constexpr std::string_view name() noexcept { return "not supported"; } +}; + +} // namespace Kokkos::Impl + +#endif + +#endif diff --git a/lib/kokkos/core/src/Kokkos_View.hpp b/lib/kokkos/core/src/Kokkos_View.hpp index 04d1fcf151..d5b352876c 100644 --- a/lib/kokkos/core/src/Kokkos_View.hpp +++ b/lib/kokkos/core/src/Kokkos_View.hpp @@ -22,2016 +22,10 @@ static_assert(false, #ifndef KOKKOS_VIEW_HPP #define KOKKOS_VIEW_HPP -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -#include -#include -#include -#endif -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct ViewArrayAnalysis; - -template ::non_const_value_type> -struct ViewDataAnalysis; - -template -class ViewMapping { - public: - enum : bool { is_assignable_data_type = false }; - enum : bool { is_assignable = false }; -}; - -template -constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( - const IntType i0, const IntType i1, const IntType i2, const IntType i3, - const IntType i4, const IntType i5, const IntType i6, const IntType i7) { - static_assert(std::is_integral::value, - "count_valid_integers() must have integer arguments."); - - return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + - (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + - (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + - (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); -} - -// FIXME Ideally, we would not instantiate this function for every possible View -// type. We should be able to only pass "extent" when we use mdspan. -template -KOKKOS_INLINE_FUNCTION void runtime_check_rank( - const View&, const bool is_void_spec, const size_t i0, const size_t i1, - const size_t i2, const size_t i3, const size_t i4, const size_t i5, - const size_t i6, const size_t i7, const char* label) { - (void)(label); - - if (is_void_spec) { - const size_t num_passed_args = - count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); - // We either allow to pass as many extents as the dynamic rank is, or - // as many extents as the total rank is. In the latter case, the given - // extents for the static dimensions must match the - // compile-time extents. - constexpr int rank = View::rank(); - constexpr int dyn_rank = View::rank_dynamic(); - const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; - const bool n_args_is_rank = num_passed_args == rank; - - if constexpr (rank != dyn_rank) { - if (n_args_is_rank) { - size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; - for (int i = dyn_rank; i < rank; ++i) - if (new_extents[i] != View::static_extent(i)) { - KOKKOS_IF_ON_HOST( - const std::string message = - "The specified run-time extent for Kokkos::View '" + - std::string(label) + - "' does not match the compile-time extent in dimension " + - std::to_string(i) + ". The given extent is " + - std::to_string(new_extents[i]) + " but should be " + - std::to_string(View::static_extent(i)) + ".\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE( - Kokkos::abort( - "The specified run-time extents for a Kokkos::View " - "do not match the compile-time extents.");) - } - } - } - - if (!n_args_is_dyn_rank && !n_args_is_rank) { - KOKKOS_IF_ON_HOST( - const std::string message = - "Constructor for Kokkos::View '" + std::string(label) + - "' has mismatched number of arguments. The number " - "of arguments = " + - std::to_string(num_passed_args) + - " neither matches the dynamic rank = " + - std::to_string(dyn_rank) + - " nor the total rank = " + std::to_string(rank) + "\n"; - Kokkos::abort(message.c_str());) - KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " - "mismatched number of arguments.");) - } - } -} - -} /* namespace Impl */ -} /* namespace Kokkos */ - -// Class to provide a uniform type -namespace Kokkos { -namespace Impl { -template -struct ViewUniformType; -} -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -/** \class ViewTraits - * \brief Traits class for accessing attributes of a View. - * - * This is an implementation detail of View. It is only of interest - * to developers implementing a new specialization of View. - * - * Template argument options: - * - View< DataType > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , ArrayLayout > - * - View< DataType , ArrayLayout , Space > - * - View< DataType , ArrayLayout , MemoryTraits > - * - View< DataType , ArrayLayout , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - */ - -template -struct ViewTraits; - -template <> -struct ViewTraits { - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = void; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - // Ignore an extraneous 'void' - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - HooksPolicy, Prop...> { - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = typename ViewTraits::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = HooksPolicy; -}; - -template -struct ViewTraits::value>, - ArrayLayout, Prop...> { - // Specify layout, keep subsequent space and memory traits arguments - - using execution_space = typename ViewTraits::execution_space; - using memory_space = typename ViewTraits::memory_space; - using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; - using array_layout = ArrayLayout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits::value>, Space, - Prop...> { - // Specify Space, memory traits should be the only subsequent argument. - - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::HostMirrorSpace, - void>::value && - std::is_same::array_layout, - void>::value, - "Only one View Execution or Memory Space template argument"); - - using execution_space = typename Space::execution_space; - using memory_space = typename Space::memory_space; - using HostMirrorSpace = - typename Kokkos::Impl::HostMirror::Space::memory_space; - using array_layout = typename execution_space::array_layout; - using memory_traits = typename ViewTraits::memory_traits; - using specialize = typename ViewTraits::specialize; - using hooks_policy = typename ViewTraits::hooks_policy; -}; - -template -struct ViewTraits< - std::enable_if_t::value>, - MemoryTraits, Prop...> { - // Specify memory trait, should not be any subsequent arguments - - static_assert( - std::is_same::execution_space, - void>::value && - std::is_same::memory_space, - void>::value && - std::is_same::array_layout, - void>::value && - std::is_same::memory_traits, - void>::value && - std::is_same::hooks_policy, - void>::value, - "MemoryTrait is the final optional template argument for a View"); - - using execution_space = void; - using memory_space = void; - using HostMirrorSpace = void; - using array_layout = void; - using memory_traits = MemoryTraits; - using specialize = void; - using hooks_policy = void; -}; - -template -struct ViewTraits { - private: - // Unpack the properties arguments - using prop = ViewTraits; - - using ExecutionSpace = - std::conditional_t::value, - typename prop::execution_space, - Kokkos::DefaultExecutionSpace>; - - using MemorySpace = - std::conditional_t::value, - typename prop::memory_space, - typename ExecutionSpace::memory_space>; - - using ArrayLayout = - std::conditional_t::value, - typename prop::array_layout, - typename ExecutionSpace::array_layout>; - - using HostMirrorSpace = std::conditional_t< - !std::is_void::value, - typename prop::HostMirrorSpace, - typename Kokkos::Impl::HostMirror::Space>; - - using MemoryTraits = - std::conditional_t::value, - typename prop::memory_traits, - typename Kokkos::MemoryManaged>; - - using HooksPolicy = - std::conditional_t::value, - typename prop::hooks_policy, - Kokkos::Experimental::DefaultViewHooks>; - - // Analyze data type's properties, - // May be specialized based upon the layout and value type - using data_analysis = Kokkos::Impl::ViewDataAnalysis; - - public: - //------------------------------------ - // Data type traits: - - using data_type = typename data_analysis::type; - using const_data_type = typename data_analysis::const_type; - using non_const_data_type = typename data_analysis::non_const_type; - - //------------------------------------ - // Compatible array of trivial type traits: - - using scalar_array_type = typename data_analysis::scalar_array_type; - using const_scalar_array_type = - typename data_analysis::const_scalar_array_type; - using non_const_scalar_array_type = - typename data_analysis::non_const_scalar_array_type; - - //------------------------------------ - // Value type traits: - - using value_type = typename data_analysis::value_type; - using const_value_type = typename data_analysis::const_value_type; - using non_const_value_type = typename data_analysis::non_const_value_type; - - //------------------------------------ - // Mapping traits: - - using array_layout = ArrayLayout; - using dimension = typename data_analysis::dimension; - - using specialize = std::conditional_t< - std::is_void::value, - typename prop::specialize, - typename data_analysis::specialize>; /* mapping specialization tag */ - - static constexpr unsigned rank = dimension::rank; - static constexpr unsigned rank_dynamic = dimension::rank_dynamic; - - //------------------------------------ - // Execution space, memory space, memory access traits, and host mirror space. - - using execution_space = ExecutionSpace; - using memory_space = MemorySpace; - using device_type = Kokkos::Device; - using memory_traits = MemoryTraits; - using host_mirror_space = HostMirrorSpace; - using hooks_policy = HooksPolicy; - - using size_type = typename MemorySpace::size_type; - - enum { is_hostspace = std::is_same::value }; - enum { is_managed = MemoryTraits::is_unmanaged == 0 }; - enum { is_random_access = MemoryTraits::is_random_access == 1 }; - - //------------------------------------ -}; - -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN -namespace Impl { -struct UnsupportedKokkosArrayLayout; - -template -struct MDSpanViewTraits { - using mdspan_type = UnsupportedKokkosArrayLayout; -}; - -// "Natural" mdspan for a view if the View's ArrayLayout is supported. -template -struct MDSpanViewTraits::type>> { - using index_type = std::size_t; - using extents_type = - typename Impl::ExtentsFromDataType::type; - using mdspan_layout_type = - typename Impl::LayoutFromArrayLayout::type; - using accessor_type = Impl::SpaceAwareAccessor< - typename Traits::memory_space, - Kokkos::default_accessor>; - using mdspan_type = mdspan; -}; -} // namespace Impl -#endif // KOKKOS_ENABLE_IMPL_MDSPAN - -/** \class View - * \brief View to an array of data. - * - * A View represents an array of one or more dimensions. - * For details, please refer to Kokkos' tutorial materials. - * - * \section Kokkos_View_TemplateParameters Template parameters - * - * This class has both required and optional template parameters. The - * \c DataType parameter must always be provided, and must always be - * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are - * placeholders for different template parameters. The default value - * of the fifth template parameter \c Specialize suffices for most use - * cases. When explaining the template parameters, we won't refer to - * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer - * to the valid categories of template parameters, in whatever order - * they may occur. - * - * Valid ways in which template arguments may be specified: - * - View< DataType > - * - View< DataType , Layout > - * - View< DataType , Layout , Space > - * - View< DataType , Layout , Space , MemoryTraits > - * - View< DataType , Space > - * - View< DataType , Space , MemoryTraits > - * - View< DataType , MemoryTraits > - * - * \tparam DataType (required) This indicates both the type of each - * entry of the array, and the combination of compile-time and - * run-time array dimension(s). For example, double* - * indicates a one-dimensional array of \c double with run-time - * dimension, and int*[3] a two-dimensional array of \c int - * with run-time first dimension and compile-time second dimension - * (of 3). In general, the run-time dimensions (if any) must go - * first, followed by zero or more compile-time dimensions. For - * more examples, please refer to the tutorial materials. - * - * \tparam Space (required) The memory space. - * - * \tparam Layout (optional) The array's layout in memory. For - * example, LayoutLeft indicates a column-major (Fortran style) - * layout, and LayoutRight a row-major (C style) layout. If not - * specified, this defaults to the preferred layout for the - * Space. - * - * \tparam MemoryTraits (optional) Assertion of the user's intended - * access behavior. For example, RandomAccess indicates read-only - * access with limited spatial locality, and Unmanaged lets users - * wrap externally allocated memory in a View without automatic - * deallocation. - * - * \section Kokkos_View_MT MemoryTraits discussion - * - * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on - * Space - * - * Some \c MemoryTraits options may have different interpretations for - * different \c Space types. For example, with the Cuda device, - * \c RandomAccess tells Kokkos to fetch the data through the texture - * cache, whereas the non-GPU devices have no such hardware construct. - * - * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits - * - * Users should defer applying the optional \c MemoryTraits parameter - * until the point at which they actually plan to rely on it in a - * computational kernel. This minimizes the number of template - * parameters exposed in their code, which reduces the cost of - * compilation. Users may always assign a View without specified - * \c MemoryTraits to a compatible View with that specification. - * For example: - * \code - * // Pass in the simplest types of View possible. - * void - * doSomething (View out, - * View in) - * { - * // Assign the "generic" View in to a RandomAccess View in_rr. - * // Note that RandomAccess View objects must have const data. - * View in_rr = in; - * // ... do something with in_rr and out ... - * } - * \endcode - */ - -} // namespace Kokkos - -namespace Kokkos { - -template -struct is_always_assignable_impl; - -template -struct is_always_assignable_impl, - Kokkos::View> { - using mapping_type = Kokkos::Impl::ViewMapping< - typename Kokkos::View::traits, - typename Kokkos::View::traits, - typename Kokkos::View::traits::specialize>; - - constexpr static bool value = - mapping_type::is_assignable && - static_cast(Kokkos::View::rank_dynamic) >= - static_cast(Kokkos::View::rank_dynamic); -}; - -template -using is_always_assignable = is_always_assignable_impl< - std::remove_reference_t, - std::remove_const_t>>; - -template -inline constexpr bool is_always_assignable_v = - is_always_assignable::value; - -template -constexpr bool is_assignable(const Kokkos::View& dst, - const Kokkos::View& src) { - using DstTraits = typename Kokkos::View::traits; - using SrcTraits = typename Kokkos::View::traits; - using mapping_type = - Kokkos::Impl::ViewMapping; - - return is_always_assignable_v, - Kokkos::View> || - (mapping_type::is_assignable && - ((DstTraits::dimension::rank_dynamic >= 1) || - (dst.static_extent(0) == src.extent(0))) && - ((DstTraits::dimension::rank_dynamic >= 2) || - (dst.static_extent(1) == src.extent(1))) && - ((DstTraits::dimension::rank_dynamic >= 3) || - (dst.static_extent(2) == src.extent(2))) && - ((DstTraits::dimension::rank_dynamic >= 4) || - (dst.static_extent(3) == src.extent(3))) && - ((DstTraits::dimension::rank_dynamic >= 5) || - (dst.static_extent(4) == src.extent(4))) && - ((DstTraits::dimension::rank_dynamic >= 6) || - (dst.static_extent(5) == src.extent(5))) && - ((DstTraits::dimension::rank_dynamic >= 7) || - (dst.static_extent(6) == src.extent(6))) && - ((DstTraits::dimension::rank_dynamic >= 8) || - (dst.static_extent(7) == src.extent(7)))); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with -// the OpenMPTarget backend -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp declare target +#if defined(KOKKOS_ENABLE_IMPL_MDSPAN) && !defined(KOKKOS_COMPILER_INTEL) +#include #endif -inline constexpr Kokkos::ALL_t ALL{}; +#include -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) -#pragma omp end declare target -#endif - -inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; - -inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; - -inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; - -/** \brief Create View allocation parameter bundle from argument list. - * - * Valid argument list members are: - * 1) label as a "string" or std::string - * 2) memory space instance of the View::memory_space type - * 3) execution space instance compatible with the View::memory_space - * 4) Kokkos::WithoutInitializing to bypass initialization - * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory - * alignment - */ -template -inline Impl::ViewCtorProp::type...> -view_alloc(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_pointer, - "Cannot give pointer-to-memory for view allocation"); - - return return_type(args...); -} - -template -KOKKOS_INLINE_FUNCTION - Impl::ViewCtorProp::type...> - view_wrap(Args const&... args) { - using return_type = - Impl::ViewCtorProp::type...>; - - static_assert(!return_type::has_memory_space && - !return_type::has_execution_space && - !return_type::has_label && return_type::has_pointer, - "Must only give pointer-to-memory for view wrapping"); - - return return_type(args...); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -class View; - -template -struct is_view : public std::false_type {}; - -template -struct is_view> : public std::true_type {}; - -template -struct is_view> : public std::true_type {}; - -template -inline constexpr bool is_view_v = is_view::value; - -template -class View : public ViewTraits { - private: - template - friend class View; - template - friend class Kokkos::Impl::ViewMapping; - - using view_tracker_type = Kokkos::Impl::ViewTracker; - - public: - using traits = ViewTraits; - - private: - using map_type = - Kokkos::Impl::ViewMapping; - template - friend struct Kokkos::Impl::ViewTracker; - using hooks_policy = typename traits::hooks_policy; - - view_tracker_type m_track; - map_type m_map; - - public: - //---------------------------------------- - /** \brief Compatible view of array of scalar types */ - using array_type = - View; - - /** \brief Compatible view of const data type */ - using const_type = - View; - - /** \brief Compatible view of non-const data type */ - using non_const_type = - View; - - /** \brief Compatible HostMirror view */ - using HostMirror = - View, - typename traits::hooks_policy>; - - /** \brief Compatible HostMirror view */ - using host_mirror_type = - View; - - /** \brief Unified types */ - using uniform_type = typename Impl::ViewUniformType::type; - using uniform_const_type = - typename Impl::ViewUniformType::const_type; - using uniform_runtime_type = - typename Impl::ViewUniformType::runtime_type; - using uniform_runtime_const_type = - typename Impl::ViewUniformType::runtime_const_type; - using uniform_nomemspace_type = - typename Impl::ViewUniformType::nomemspace_type; - using uniform_const_nomemspace_type = - typename Impl::ViewUniformType::const_nomemspace_type; - using uniform_runtime_nomemspace_type = - typename Impl::ViewUniformType::runtime_nomemspace_type; - using uniform_runtime_const_nomemspace_type = - typename Impl::ViewUniformType::runtime_const_nomemspace_type; - - //---------------------------------------- - // Domain rank and extents - - static constexpr Impl::integral_constant - rank = {}; - static constexpr Impl::integral_constant - rank_dynamic = {}; -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = - map_type::Rank}; -#endif - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - extent(const iType& r) const noexcept { - return m_map.extent(r); - } - - static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( - const unsigned r) noexcept { - return map_type::static_extent(r); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, int> - extent_int(const iType& r) const noexcept { - return static_cast(m_map.extent(r)); - } - - KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() - const { - return m_map.layout(); - } - - //---------------------------------------- - /* Deprecate all 'dimension' functions in favor of - * ISO/C++ vocabulary 'extent'. - */ - - KOKKOS_INLINE_FUNCTION constexpr size_t size() const { - return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * - m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * - m_map.dimension_6() * m_map.dimension_7(); - } - - KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { - return m_map.stride_0(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { - return m_map.stride_1(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { - return m_map.stride_2(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { - return m_map.stride_3(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { - return m_map.stride_4(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { - return m_map.stride_5(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { - return m_map.stride_6(); - } - KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { - return m_map.stride_7(); - } - - template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t< - std::is_integral::value, size_t> - stride(iType r) const { - return ( - r == 0 - ? m_map.stride_0() - : (r == 1 - ? m_map.stride_1() - : (r == 2 - ? m_map.stride_2() - : (r == 3 - ? m_map.stride_3() - : (r == 4 - ? m_map.stride_4() - : (r == 5 - ? m_map.stride_5() - : (r == 6 - ? m_map.stride_6() - : m_map.stride_7()))))))); - } - - template - KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { - m_map.stride(s); - } - - //---------------------------------------- - // Range span is the span which contains all members. - - using reference_type = typename map_type::reference_type; - using pointer_type = typename map_type::pointer_type; - - enum { - reference_type_is_lvalue_reference = - std::is_lvalue_reference::value - }; - - KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } - KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { - return m_map.span_is_contiguous(); - } - KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { - return m_map.data() != nullptr; - } - KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { - return m_map.data(); - } - - //---------------------------------------- - // Allow specializations to query their specialized map - - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::ViewMapping& - impl_map() const { - return m_map; - } - KOKKOS_INLINE_FUNCTION - const Kokkos::Impl::SharedAllocationTracker& impl_track() const { - return m_track.m_tracker; - } - //---------------------------------------- - - private: - static constexpr bool is_layout_left = - std::is_same::value; - - static constexpr bool is_layout_right = - std::is_same::value; - - static constexpr bool is_layout_stride = - std::is_same::value; - - static constexpr bool is_default_map = - std::is_void::value && - (is_layout_left || is_layout_right || is_layout_stride); - -#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); \ - Kokkos::Impl::view_verify_operator_bounds( \ - __VA_ARGS__); - -#else - -#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ - Kokkos::Impl::runtime_check_memory_access_violation< \ - typename traits::memory_space>( \ - "Kokkos::View ERROR: attempt to access inaccessible memory space", \ - __VA_ARGS__); - -#endif - - template - static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { - static_assert(rank <= sizeof...(Is)); - static_assert(sizeof...(Is) <= 8); - static_assert(Kokkos::Impl::are_integral::value); - } - - template - static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { - static_assert(rank == sizeof...(Is)); - static_assert(Kokkos::Impl::are_integral::value); - } - - public: - //------------------------------ - // Rank 1 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - operator()(I0 i0) const { - check_operator_parens_valid_args(i0); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 1 operator[] - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && !is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && - is_default_map && is_layout_stride), - reference_type> - operator[](I0 i0) const { - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 default map operator() - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 == rank) && is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - operator()(I0 i0, I1 i1) const { - check_operator_parens_valid_args(i0, i1); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which - // have "inlined" versions above - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && // - (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && // - ((0 == rank) || !is_default_map)), - reference_type> - operator()(Is... indices) const { - check_operator_parens_valid_args(indices...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) - return m_map.reference(indices...); - } - - //------------------------------ - // Rank 0 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> - access(Is... extra) const { - check_access_member_function_valid_args(extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) - return m_map.reference(); - } - - //------------------------------ - // Rank 1 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && !is_default_map), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.reference(i0); - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && !is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[i0]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (1 == rank) && is_default_map && is_layout_stride), - reference_type> - access(I0 i0, Is... extra) const { - check_access_member_function_valid_args(i0, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; - } - - //------------------------------ - // Rank 2 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (2 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - return m_map.reference(i0, i1); - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (2 == rank) && - is_default_map && - (is_layout_left || is_layout_right || is_layout_stride)), - reference_type> - access(I0 i0, I1 i1, Is... extra) const { - check_access_member_function_valid_args(i0, i1, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) - if constexpr (is_layout_left) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; - else - return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; - } else if constexpr (is_layout_right) { - if constexpr (rank_dynamic == 0) - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; - else - return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; - } else { - static_assert(is_layout_stride); - return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + - i1 * m_map.m_impl_offset.m_stride.S1]; - } -#if defined KOKKOS_COMPILER_INTEL - __builtin_unreachable(); -#endif - } - - //------------------------------ - // Rank 3 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (3 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) - return m_map.reference(i0, i1, i2); - } - - //------------------------------ - // Rank 4 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && (4 == rank) && - !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) - return m_map.reference(i0, i1, i2, i3); - } - - //------------------------------ - // Rank 5 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (5 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, - extra...) - return m_map.reference(i0, i1, i2, i3, i4); - } - - //------------------------------ - // Rank 6 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (6 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5); - } - - //------------------------------ - // Rank 7 - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - (Kokkos::Impl::always_true::value && - (7 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6); - } - - //------------------------------ - // Rank 8 - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map - .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; - } - - template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(Kokkos::Impl::always_true::value && - (8 == rank) && !is_default_map), - reference_type> - access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, - Is... extra) const { - check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, - extra...); - KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, - i7, extra...) - return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); - } - -#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY - - //---------------------------------------- - // Standard destructor, constructors, and assignment operators - - KOKKOS_DEFAULTED_FUNCTION - ~View() = default; - - KOKKOS_DEFAULTED_FUNCTION - View() = default; - - KOKKOS_FUNCTION - View(const View& other) : m_track(other.m_track), m_map(other.m_map) { - KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View(View&& other) - : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { - KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) - } - - KOKKOS_FUNCTION - View& operator=(const View& other) { - m_map = other.m_map; - m_track = other.m_track; - - KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) - - return *this; - } - - KOKKOS_FUNCTION - View& operator=(View&& other) { - m_map = std::move(other.m_map); - m_track = std::move(other.m_track); - - KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) - - return *this; - } - - //---------------------------------------- - // Compatible view copy constructor and assignment - // may assign unmanaged from managed. - - template - KOKKOS_INLINE_FUNCTION View( - const View& rhs, - std::enable_if_t::traits, - typename traits::specialize>::is_assignable_data_type>* = nullptr) - : m_track(rhs), m_map() { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - } - - template - KOKKOS_INLINE_FUNCTION std::enable_if_t< - Kokkos::Impl::ViewMapping< - traits, typename View::traits, - typename traits::specialize>::is_assignable_data_type, - View>& - operator=(const View& rhs) { - using SrcTraits = typename View::traits; - using Mapping = Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); - Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); - m_track.assign(rhs); - return *this; - } - - //---------------------------------------- - // Compatible subview constructor - // may assign unmanaged from managed. - - template - KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, - Args... args) - : m_track(src_view), m_map() { - using SrcType = View; - - using Mapping = Kokkos::Impl::ViewMapping; - - using DstType = typename Mapping::type; - - static_assert( - Kokkos::Impl::ViewMapping::is_assignable, - "Subview construction requires compatible view and subview arguments"); - - Mapping::assign(m_map, src_view.m_map, arg0, args...); - } - - //---------------------------------------- - // Allocation tracking properties - - KOKKOS_INLINE_FUNCTION - int use_count() const { return m_track.m_tracker.use_count(); } - - inline const std::string label() const { - return m_track.m_tracker - .template get_label(); - } - - public: - //---------------------------------------- - // Allocation according to allocation properties and array layout - - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track(), m_map() { - // Copy the input allocation properties with possibly defaulted properties - // We need to split it in two to avoid MSVC compiler errors - auto prop_copy_tmp = - Impl::with_properties_if_unset(arg_prop, std::string{}); - auto prop_copy = Impl::with_properties_if_unset( - prop_copy_tmp, typename traits::device_type::memory_space{}, - typename traits::device_type::execution_space{}); - using alloc_prop = decltype(prop_copy); - - static_assert(traits::is_managed, - "View allocation constructor requires managed memory"); - - if (alloc_prop::initialize && - !alloc_prop::execution_space::impl_is_initialized()) { - // If initializing view data then - // the execution space must be initialized. - Kokkos::Impl::throw_runtime_exception( - "Constructing View and initializing data with uninitialized " - "execution space"); - } - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - const std::string& alloc_name = - Impl::get_property(prop_copy); - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, alloc_name.c_str()); - } -#endif - - Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( - prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); - - // Setup and initialization complete, start tracking - m_track.m_tracker.assign_allocated_record_to_uninitialized(record); - } - - KOKKOS_INLINE_FUNCTION - void assign_data(pointer_type arg_data) { - m_track.m_tracker.clear(); - m_map.assign_data(arg_data); - } - - // Wrap memory according to properties and array layout - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, - typename traits::array_layout> const& arg_layout) - : m_track() // No memory tracking - , - m_map(arg_prop, arg_layout) { - static_assert( - std::is_same::pointer_type>::value, - "Constructing View to wrap user memory must supply matching pointer " - "type"); - -#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK - if constexpr (std::is_same_v || - std::is_same_v || - std::is_same_v) { - size_t i0 = arg_layout.dimension[0]; - size_t i1 = arg_layout.dimension[1]; - size_t i2 = arg_layout.dimension[2]; - size_t i3 = arg_layout.dimension[3]; - size_t i4 = arg_layout.dimension[4]; - size_t i5 = arg_layout.dimension[5]; - size_t i6 = arg_layout.dimension[6]; - size_t i7 = arg_layout.dimension[7]; - - Impl::runtime_check_rank( - *this, std::is_same::value, i0, i1, - i2, i3, i4, i5, i6, i7, "UNMANAGED"); - } -#endif - } - - // Simple dimension-only layout - template - explicit inline View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - template - explicit KOKKOS_INLINE_FUNCTION View( - const Impl::ViewCtorProp& arg_prop, - std::enable_if_t::has_pointer, size_t> const - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(arg_prop, - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Allocate with label and layout - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, - typename traits::array_layout> const& arg_layout) - : View(Impl::ViewCtorProp(arg_label), arg_layout) {} - - // Allocate label and layout, must disambiguate from subview constructor. - template - explicit inline View( - const Label& arg_label, - std::enable_if_t::value, const size_t> - arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_label), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - // Construct view from ViewTracker and map - // This should be the preferred method because future extensions may need to - // use the ViewTracker class. - template - KOKKOS_INLINE_FUNCTION View( - const view_tracker_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track.m_tracker); - } - - // Construct View from internal shared allocation tracker object and map - // This is here for backwards compatibility for classes that derive from - // Kokkos::View - template - KOKKOS_INLINE_FUNCTION View( - const typename view_tracker_type::track_type& track, - const Kokkos::Impl::ViewMapping& map) - : m_track(track), m_map() { - using Mapping = - Kokkos::Impl::ViewMapping; - static_assert(Mapping::is_assignable, - "Incompatible View copy construction"); - Mapping::assign(m_map, map, track); - } - - //---------------------------------------- - // Memory span required to wrap these dimensions. - static constexpr size_t required_allocation_size( - typename traits::array_layout const& layout) { - return map_type::memory_span(layout); - } - - static constexpr size_t required_allocation_size( - const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, - const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, - const size_t arg_N6 = 0, const size_t arg_N7 = 0) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - return map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp(arg_ptr), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - explicit KOKKOS_INLINE_FUNCTION View( - pointer_type arg_ptr, const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} - - //---------------------------------------- - // Shared scratch memory constructor - - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, - const size_t arg_N1 = KOKKOS_INVALID_INDEX, - const size_t arg_N2 = KOKKOS_INVALID_INDEX, - const size_t arg_N3 = KOKKOS_INVALID_INDEX, - const size_t arg_N4 = KOKKOS_INVALID_INDEX, - const size_t arg_N5 = KOKKOS_INVALID_INDEX, - const size_t arg_N6 = KOKKOS_INVALID_INDEX, - const size_t arg_N7 = KOKKOS_INVALID_INDEX) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - const size_t num_passed_args = Impl::count_valid_integers( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); - - if (std::is_void::value && - num_passed_args != rank_dynamic) { - Kokkos::abort( - "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); - } - - return View::shmem_size(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); - } - - private: - // Want to be able to align to minimum scratch alignment or sizeof or alignof - // elements - static constexpr size_t scratch_value_alignment = - max({sizeof(typename traits::value_type), - alignof(typename traits::value_type), - static_cast( - traits::execution_space::scratch_memory_space::ALIGN)}); - - public: - static KOKKOS_INLINE_FUNCTION size_t - shmem_size(typename traits::array_layout const& arg_layout) { - return map_type::memory_span(arg_layout) + scratch_value_alignment; - } - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const typename traits::array_layout& arg_layout) - : View(Impl::ViewCtorProp(reinterpret_cast( - arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), - scratch_value_alignment))), - arg_layout) {} - - explicit KOKKOS_INLINE_FUNCTION View( - const typename traits::execution_space::scratch_memory_space& arg_space, - const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, - const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) - : View(Impl::ViewCtorProp( - reinterpret_cast(arg_space.get_shmem_aligned( - map_type::memory_span(typename traits::array_layout( - arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, - arg_N7)), - scratch_value_alignment))), - typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, - arg_N4, arg_N5, arg_N6, arg_N7)) { - static_assert(traits::array_layout::is_extent_constructible, - "Layout is not constructible from extent arguments. Use " - "overload taking a layout object instead."); - } - - //---------------------------------------- - // MDSpan converting constructors -#ifdef KOKKOS_ENABLE_IMPL_MDSPAN - template ::mdspan_type> - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(traits::is_managed) -#endif - View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, - std::enable_if_t< - !std::is_same_v>* = - nullptr) - : View(mds.data_handle(), - Impl::array_layout_from_mapping< - typename traits::array_layout, - typename Impl::MDSpanViewTraits::mdspan_type>( - mds.mapping())) { - } - - template - KOKKOS_INLINE_FUNCTION -#ifndef KOKKOS_ENABLE_CXX17 - explicit(!std::is_convertible_v< - Kokkos::mdspan, - typename Impl::MDSpanViewTraits::mdspan_type>) -#endif - View(const Kokkos::mdspan& mds) - : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { - } - - //---------------------------------------- - // Conversion to MDSpan - template ::mdspan_type, - typename = std::enable_if_t, - std::false_type, - std::is_assignable, - ImplNaturalMDSpanType>>::value>> - KOKKOS_INLINE_FUNCTION constexpr operator mdspan< - OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - return mdspan_type{data(), - Impl::mapping_from_view_mapping(m_map)}; - } - - template >, - typename = std::enable_if_t>> - KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( - const OtherAccessorType& other_accessor = - typename Impl::MDSpanViewTraits::accessor_type()) { - using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; - using ret_mdspan_type = - mdspan; - return ret_mdspan_type{data(), - Impl::mapping_from_view_mapping(m_map), - other_accessor}; - } -#endif // KOKKOS_ENABLE_IMPL_MDSPAN -}; - -template -KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { - return View::rank(); -} - -namespace Impl { - -template -struct RankDataType { - using type = typename RankDataType::type*; -}; - -template -struct RankDataType { - using type = ValueType; -}; - -template -KOKKOS_FUNCTION std::enable_if_t< - N == View::rank() && - std::is_same::specialize, void>::value, - View> -as_view_of_rank_n(View v) { - return v; -} - -// Placeholder implementation to compile generic code for DynRankView; should -// never be called -template -KOKKOS_FUNCTION std::enable_if_t< - N != View::rank() && - std::is_same::specialize, void>::value, - View::value_type, N>::type, - Args...>> -as_view_of_rank_n(View) { - Kokkos::abort("Trying to get at a View of the wrong rank"); - return {}; -} - -template -void apply_to_view_of_static_rank(Function&& f, View a) { - f(a); -} - -} // namespace Impl -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Impl { -template -struct TypeListToViewTraits; - -template -struct TypeListToViewTraits> { - using type = ViewTraits; -}; - -// It is not safe to assume that subviews of views with the Aligned memory trait -// are also aligned. Hence, just remove that attribute for subviews. -template -struct RemoveAlignedMemoryTrait { - private: - using type_list_in = Kokkos::Impl::type_list; - using memory_traits = typename ViewTraits::memory_traits; - using type_list_in_wo_memory_traits = - typename Kokkos::Impl::type_list_remove_first::type; - using new_memory_traits = - Kokkos::MemoryTraits; - using new_type_list = typename Kokkos::Impl::concat_type_list< - type_list_in_wo_memory_traits, - Kokkos::Impl::type_list>::type; - - public: - using type = typename TypeListToViewTraits::type; -}; -} // namespace Impl - -template -KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -template -KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, - Args... args) { - static_assert(View::rank == sizeof...(Args), - "subview requires one argument for each source View rank"); - static_assert(Kokkos::is_memory_traits::value); - - return typename Kokkos::Impl::ViewMapping< - void /* deduce subview type from source view traits */ - , - typename Impl::RemoveAlignedMemoryTrait::type, - Args...>::type(src, args...); -} -#endif - -template -using Subview = decltype(subview(std::declval(), std::declval()...)); - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, - const View& rhs) { - // Same data, layout, dimensions - using lhs_traits = ViewTraits; - using rhs_traits = ViewTraits; - - return std::is_same::value && - std::is_same::value && - std::is_same::value && - View::rank() == View::rank() && - lhs.data() == rhs.data() && lhs.span() == rhs.span() && - lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && - lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && - lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && - lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); -} - -template -KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, - const View& rhs) { - return !(operator==(lhs, rhs)); -} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template -struct CommonViewValueType; - -template -struct CommonViewValueType { - using value_type = std::common_type_t; -}; - -template -struct CommonViewAllocProp; - -template -struct CommonViewAllocProp { - using value_type = ValueType; - using scalar_array_type = ValueType; - - template - KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const Views&...) {} -}; - -template -struct DeduceCommonViewAllocProp; - -// Base case must provide types for: -// 1. specialize 2. value_type 3. is_view 4. prop_type -template -struct DeduceCommonViewAllocProp { - using specialize = typename FirstView::traits::specialize; - - using value_type = typename FirstView::traits::value_type; - - enum : bool { is_view = is_view::value }; - - using prop_type = CommonViewAllocProp; -}; - -template -struct DeduceCommonViewAllocProp { - using NextTraits = DeduceCommonViewAllocProp; - - using first_specialize = typename FirstView::traits::specialize; - using first_value_type = typename FirstView::traits::value_type; - - enum : bool { first_is_view = is_view::value }; - - using next_specialize = typename NextTraits::specialize; - using next_value_type = typename NextTraits::value_type; - - enum : bool { next_is_view = NextTraits::is_view }; - - // common types - - // determine specialize type - // if first and next specialize differ, but are not the same specialize, error - // out - static_assert(!(!std::is_same::value && - !std::is_void::value && - !std::is_void::value), - "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " - "specialize trait allowed"); - - // otherwise choose non-void specialize if either/both are non-void - using specialize = std::conditional_t< - std::is_same::value, first_specialize, - std::conditional_t<(std::is_void::value && - !std::is_void::value), - next_specialize, first_specialize>>; - - using value_type = typename CommonViewValueType::value_type; - - enum : bool { is_view = (first_is_view && next_is_view) }; - - using prop_type = CommonViewAllocProp; -}; - -} // end namespace Impl - -template -using DeducedCommonPropsType = - typename Impl::DeduceCommonViewAllocProp::prop_type; - -// This function is required in certain scenarios where users customize -// Kokkos View internals. One example are dynamic length embedded ensemble -// types. The function is used to propagate necessary information -// (like the ensemble size) when creating new views. -// However, most of the time it is called with a single view. -// Furthermore, the propagated information is not just for view allocations. -// From what I can tell, the type of functionality provided by -// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, -// a mechanism we will eventually use to replace this clunky approach here, when -// we are finally mdspan based. -// TODO: get rid of this when we have mdspan -template -KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( - Views const&... views) { - return DeducedCommonPropsType(views...); -} - -} // namespace Kokkos - -#include -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #ifndef KOKKOS_VIEW_HPP */ +#endif /* KOKKOS_VIEW_HPP */ diff --git a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp index efa56a086e..4d22634281 100644 --- a/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Kokkos_WorkGraphPolicy.hpp @@ -120,7 +120,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { (std::int32_t)BEGIN_TOKEN))) { // Attempt to claim ready work index succeeded, // update the hint and return work index - atomic_increment(begin_hint); + atomic_inc(begin_hint); return w; } // arrive here when ready_queue[i] == BEGIN_TOKEN @@ -169,7 +169,7 @@ class WorkGraphPolicy : public Kokkos::Impl::PolicyTraits { void operator()(const TagCount, int i) const noexcept { std::int32_t* const count_queue = &m_queue[m_graph.numRows()]; - atomic_increment(count_queue + m_graph.entries[i]); + atomic_inc(count_queue + m_graph.entries[i]); } KOKKOS_INLINE_FUNCTION diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp index 99daf379b6..37fcfb7a1d 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.cpp @@ -23,7 +23,19 @@ #include #include +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) +#include +#elif defined(KOKKOS_ARCH_AMD_GPU) +// FIXME_OPENACC - hip_runtime_api.h contains two implementations: one for AMD +// GPUs and the other for NVIDIA GPUs; below macro is needed to choose AMD GPUs. +#define __HIP_PLATFORM_AMD__ +#include +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) +#include +#endif + #include +#include Kokkos::Experimental::OpenACC::OpenACC() : m_space_instance( @@ -46,6 +58,8 @@ Kokkos::Experimental::OpenACC::OpenACC(int async_arg) void Kokkos::Experimental::OpenACC::impl_initialize( InitializationSettings const& settings) { + Impl::OpenACCInternal::m_concurrency = + 256000; // FIXME_OPENACC - random guess when cannot compute if (Impl::OpenACC_Traits::may_fallback_to_host && acc_get_num_devices(Impl::OpenACC_Traits::dev_type) == 0 && !settings.has_device_id()) { @@ -59,11 +73,46 @@ void Kokkos::Experimental::OpenACC::impl_initialize( acc_get_device_num(acc_device_host); } else { using Kokkos::Impl::get_visible_devices; + acc_set_device_type(Impl::OpenACC_Traits::dev_type); std::vector const& visible_devices = get_visible_devices(); using Kokkos::Impl::get_gpu; int const dev_num = get_gpu(settings).value_or(visible_devices[0]); acc_set_device_num(dev_num, Impl::OpenACC_Traits::dev_type); Impl::OpenACCInternal::m_acc_device_num = dev_num; +#if defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU) + cudaDeviceProp deviceProp; + cudaError error = cudaGetDeviceProperties(&deviceProp, dev_num); + if (error != cudaSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "CUDA device properties: (" << cudaGetErrorName(error) + << "): " << cudaGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ARCH_AMD_GPU) + hipDeviceProp_t deviceProp; + hipError_t error = hipGetDeviceProperties(&deviceProp, dev_num); + if (error != hipSuccess) { + std::ostringstream msg; + msg << "Error: During OpenACC backend initialization, failed to retrieve " + << "HIP device properties: (" << hipGetErrorName(error) + << "): " << hipGetErrorString(error); + Kokkos::Impl::host_abort(msg.str().c_str()); + } + Impl::OpenACCInternal::m_concurrency = + deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + Impl::OpenACCInternal::m_concurrency = std::thread::hardware_concurrency(); + if (Impl::OpenACCInternal::m_concurrency == 0) { + Kokkos::Impl::host_abort( + "Error: During OpenACC backend initialization, failed to retrieve " + "CPU hardware concurrency"); + } +#else + // FIXME_OPENACC: Compute Impl::OpenACCInternal::m_concurrency correctly. +#endif } Impl::OpenACCInternal::singleton().initialize(); } @@ -86,6 +135,12 @@ void Kokkos::Experimental::OpenACC::print_configuration(std::ostream& os, os << "yes\n"; #else os << "no\n"; +#endif + os << " KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE: "; +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + os << "yes\n"; +#else + os << "no\n"; #endif m_space_instance->print_configuration(os, verbose); } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp index 5155bee33d..aee696bd34 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC.hpp @@ -42,6 +42,7 @@ static_assert(false, // LLVM/Clacc compiler does not need this. #ifndef KOKKOS_COMPILER_CLANG #define KOKKOS_ENABLE_OPENACC_COLLAPSE_HIERARCHICAL_CONSTRUCTS +#define KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS #endif namespace Kokkos::Experimental::Impl { @@ -87,9 +88,9 @@ class OpenACC { static char const* name() { return "OpenACC"; } #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - static int concurrency() { return 256000; } // FIXME_OPENACC + static int concurrency(); #else - int concurrency() const { return 256000; } // FIXME_OPENACC + int concurrency() const; #endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 KOKKOS_DEPRECATED static bool in_parallel() { diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp index 4e7170cbbd..75cef98a8d 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACCSpace.hpp @@ -85,16 +85,26 @@ class OpenACCSpace { template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; template <> struct Kokkos::Impl::MemorySpaceAccess { +#if defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + enum : bool{assignable = true}; + enum : bool{accessible = true}; +#else enum : bool { assignable = false }; enum : bool { accessible = false }; +#endif enum : bool { deepcopy = true }; }; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp index 82d38586eb..1373f8fa7a 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_FunctorAdapter.hpp @@ -38,7 +38,7 @@ class FunctorAdapter; \ KOKKOS_IMPL_ACC_PRAGMA(routine CLAUSE) \ template \ - KOKKOS_FUNCTION void operator()(Args &&... args) const { \ + KOKKOS_FUNCTION void operator()(Args &&...args) const { \ if constexpr (std::is_void_v) { \ m_functor(static_cast(args)...); \ } else { \ diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp index 10a76fbd31..1dad499c1b 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.cpp @@ -27,6 +27,7 @@ // Arbitrary value to denote that we don't know yet what device to use. int Kokkos::Experimental::Impl::OpenACCInternal::m_acc_device_num = -1; +int Kokkos::Experimental::Impl::OpenACCInternal::m_concurrency = -1; Kokkos::Experimental::Impl::OpenACCInternal& Kokkos::Experimental::Impl::OpenACCInternal::singleton() { @@ -78,8 +79,18 @@ void Kokkos::Experimental::Impl::OpenACCInternal::fence( [&]() { acc_wait(m_async_arg); }); } -uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() const - noexcept { +uint32_t Kokkos::Experimental::Impl::OpenACCInternal::instance_id() + const noexcept { return Kokkos::Tools::Experimental::Impl::idForInstance( reinterpret_cast(this)); } + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +int Kokkos::Experimental::OpenACC::concurrency() { + return Impl::OpenACCInternal::m_concurrency; +} +#else +int Kokkos::Experimental::OpenACC::concurrency() const { + return Impl::OpenACCInternal::m_concurrency; +} +#endif diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp index c3d7236872..343d9921a9 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Instance.hpp @@ -30,11 +30,12 @@ namespace Kokkos::Experimental::Impl { class OpenACCInternal { bool m_is_initialized = false; - OpenACCInternal(const OpenACCInternal&) = default; + OpenACCInternal(const OpenACCInternal&) = default; OpenACCInternal& operator=(const OpenACCInternal&) = default; public: static int m_acc_device_num; + static int m_concurrency; int m_async_arg = acc_async_noval; OpenACCInternal() = default; diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp index 550436fe7b..629d26928e 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelFor_MDRange.hpp @@ -30,10 +30,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i1 = m / dim0 + begin1; + auto i0 = m % dim0 + begin0; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -42,6 +55,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, functor(i0, i1); } } +#endif } template @@ -50,10 +64,23 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<2> const& begin, OpenACCMDRangeEnd<2> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto i0 = m / dim1 + begin0; + auto i1 = m % dim1 + begin1; + functor(i0, i1); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(2) copyin(functor) async(async_arg) // clang-format on @@ -62,6 +89,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, functor(i0, i1); } } +#endif } template @@ -71,12 +99,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1) copyin(functor) async(async_arg) // clang-format on @@ -94,12 +122,12 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<2> const& end, OpenACCMDRangeTile<2> const& tile, int async_arg) { - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; // clang-format off #pragma acc parallel loop gang vector tile(tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -116,12 +144,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim1 * dim0; + auto i2 = m / tmp1 + begin2; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -132,6 +177,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -140,12 +186,29 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<3> const& begin, OpenACCMDRangeEnd<3> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + auto i1 = tmp2 / dim2 + begin1; + auto i2 = tmp2 % dim2 + begin2; + functor(i0, i1, i2); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(3) copyin(functor) async(async_arg) // clang-format on @@ -156,6 +219,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -165,15 +229,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2) copyin(functor) async(async_arg) // clang-format on @@ -193,15 +257,15 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<3> const& end, OpenACCMDRangeTile<3> const& tile, int async_arg) { - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; // clang-format off #pragma acc parallel loop gang vector tile(tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -220,14 +284,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim2 * dim1 * dim0; + auto i3 = m / tmp1 + begin3; + auto tmp2 = m % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -240,6 +325,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -248,14 +334,35 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<4> const& begin, OpenACCMDRangeEnd<4> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + auto i2 = tmp2 / dim3 + begin2; + auto i3 = tmp2 % dim3 + begin3; + functor(i0, i1, i2, i3); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(4) copyin(functor) async(async_arg) // clang-format on @@ -268,6 +375,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -277,18 +385,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3) copyin(functor) async(async_arg) // clang-format on @@ -310,18 +418,18 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<4> const& end, OpenACCMDRangeTile<4> const& tile, int async_arg) { - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; // clang-format off #pragma acc parallel loop gang vector tile(tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -342,16 +450,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = m / tmp1 + begin4; + auto tmp2 = m % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -366,6 +499,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -374,16 +508,41 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<5> const& begin, OpenACCMDRangeEnd<5> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i3 = tmp2 / dim4 + begin3; + auto i4 = tmp2 % dim4 + begin4; + functor(i0, i1, i2, i3, i4); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(5) copyin(functor) async(async_arg) // clang-format on @@ -398,6 +557,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -407,21 +567,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4) copyin(functor) async(async_arg) // clang-format on @@ -445,21 +605,21 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<5> const& end, OpenACCMDRangeTile<5> const& tile, int async_arg) { - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; // clang-format off #pragma acc parallel loop gang vector tile(tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on @@ -482,18 +642,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; + auto i5 = m / tmp1 + begin5; + auto tmp2 = m % tmp1; + tmp1 = dim3 * dim2 * dim1 * dim0; + auto i4 = tmp2 / tmp1 + begin4; + tmp2 = tmp2 % tmp1; + tmp1 = dim2 * dim1 * dim0; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + tmp1 = dim1 * dim0; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + auto i1 = tmp2 / dim0 + begin1; + auto i0 = tmp2 % dim0 + begin0; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -510,6 +699,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateLeft, } } } +#endif } template @@ -518,18 +708,47 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, OpenACCMDRangeBegin<6> const& begin, OpenACCMDRangeEnd<6> const& end, int async_arg) { - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; +#if defined(KOKKOS_ENABLE_OPENACC_COLLAPSE_MDRANGE_LOOPS) + auto dim5 = end5 - begin5; + auto dim4 = end4 - begin4; + auto dim3 = end3 - begin3; + auto dim2 = end2 - begin2; + auto dim1 = end1 - begin1; + auto dim0 = end0 - begin0; + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; +// clang-format off +#pragma acc parallel loop gang vector copyin(functor) async(async_arg) + // clang-format on + for (decltype(nIter) m = 0; m < nIter; ++m) { + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; + auto i0 = m / tmp1 + begin0; + auto tmp2 = m % tmp1; + tmp1 = dim5 * dim4 * dim3 * dim2; + auto i1 = tmp2 / tmp1 + begin1; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4 * dim3; + auto i2 = tmp2 / tmp1 + begin2; + tmp2 = tmp2 % tmp1; + tmp1 = dim5 * dim4; + auto i3 = tmp2 / tmp1 + begin3; + tmp2 = tmp2 % tmp1; + auto i4 = tmp2 / dim5 + begin4; + auto i5 = tmp2 % dim5 + begin5; + functor(i0, i1, i2, i3, i4, i5); + } +#else // clang-format off #pragma acc parallel loop gang vector collapse(6) copyin(functor) async(async_arg) // clang-format on @@ -546,6 +765,7 @@ void OpenACCParallelForMDRangePolicy(OpenACCCollapse, OpenACCIterateRight, } } } +#endif } template @@ -555,24 +775,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateLeft, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile0 = tile[0]; - int tile1 = tile[1]; - int tile2 = tile[2]; - int tile3 = tile[3]; - int tile4 = tile[4]; - int tile5 = tile[5]; - int begin5 = begin[5]; - int end5 = end[5]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin0 = begin[0]; - int end0 = end[0]; + auto tile0 = tile[0]; + auto tile1 = tile[1]; + auto tile2 = tile[2]; + auto tile3 = tile[3]; + auto tile4 = tile[4]; + auto tile5 = tile[5]; + auto begin5 = begin[5]; + auto end5 = end[5]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin0 = begin[0]; + auto end0 = end[0]; // clang-format off #pragma acc parallel loop gang vector tile(tile0,tile1,tile2,tile3,tile4,tile5) copyin(functor) async(async_arg) // clang-format on @@ -598,24 +818,24 @@ void OpenACCParallelForMDRangePolicy(OpenACCTile, OpenACCIterateRight, OpenACCMDRangeEnd<6> const& end, OpenACCMDRangeTile<6> const& tile, int async_arg) { - int tile5 = tile[5]; - int tile4 = tile[4]; - int tile3 = tile[3]; - int tile2 = tile[2]; - int tile1 = tile[1]; - int tile0 = tile[0]; - int begin0 = begin[0]; - int end0 = end[0]; - int begin1 = begin[1]; - int end1 = end[1]; - int begin2 = begin[2]; - int end2 = end[2]; - int begin3 = begin[3]; - int end3 = end[3]; - int begin4 = begin[4]; - int end4 = end[4]; - int begin5 = begin[5]; - int end5 = end[5]; + auto tile5 = tile[5]; + auto tile4 = tile[4]; + auto tile3 = tile[3]; + auto tile2 = tile[2]; + auto tile1 = tile[1]; + auto tile0 = tile[0]; + auto begin0 = begin[0]; + auto end0 = end[0]; + auto begin1 = begin[1]; + auto end1 = end[1]; + auto begin2 = begin[2]; + auto end2 = end[2]; + auto begin3 = begin[3]; + auto end3 = end[3]; + auto begin4 = begin[4]; + auto end4 = end[4]; + auto begin5 = begin[5]; + auto end5 = end[5]; // clang-format off #pragma acc parallel loop gang vector tile(tile5,tile4,tile3,tile2,tile1,tile0) copyin(functor) async(async_arg) // clang-format on diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp index 5afb5e75d3..2b5631d6f8 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_MDRange.hpp @@ -113,6 +113,404 @@ class Kokkos::Impl::ParallelReduce \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i1 = m / dim0 + begin1; \ + auto i0 = m % dim0 + begin0; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<2> const& begin, \ + OpenACCMDRangeEnd<2> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto i0 = m / dim1 + begin0; \ + auto i1 = m % dim1 + begin1; \ + functor(i0, i1, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim1 * dim0; \ + auto i2 = m / tmp1 + begin2; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<3> const& begin, \ + OpenACCMDRangeEnd<3> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + auto i1 = tmp2 / dim2 + begin1; \ + auto i2 = tmp2 % dim2 + begin2; \ + functor(i0, i1, i2, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim2 * dim1 * dim0; \ + auto i3 = m / tmp1 + begin3; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<4> const& begin, \ + OpenACCMDRangeEnd<4> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + auto i2 = tmp2 / dim3 + begin2; \ + auto i3 = tmp2 % dim3 + begin3; \ + functor(i0, i1, i2, i3, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = m / tmp1 + begin4; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<5> const& begin, \ + OpenACCMDRangeEnd<5> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i3 = tmp2 / dim4 + begin3; \ + auto i4 = tmp2 % dim4 + begin4; \ + functor(i0, i1, i2, i3, i4, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateLeft, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim4 * dim3 * dim2 * dim1 * dim0; \ + auto i5 = m / tmp1 + begin5; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim3 * dim2 * dim1 * dim0; \ + auto i4 = tmp2 / tmp1 + begin4; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim2 * dim1 * dim0; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim1 * dim0; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + auto i1 = tmp2 / dim0 + begin1; \ + auto i0 = tmp2 % dim0 + begin0; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + \ + template \ + void OpenACCParallelReduce##REDUCER(OpenACCIterateRight, ValueType& aval, \ + Functor const& afunctor, \ + OpenACCMDRangeBegin<6> const& begin, \ + OpenACCMDRangeEnd<6> const& end, \ + int async_arg) { \ + auto val = aval; \ + auto const functor(afunctor); \ + auto begin0 = begin[0]; \ + auto end0 = end[0]; \ + auto begin1 = begin[1]; \ + auto end1 = end[1]; \ + auto begin2 = begin[2]; \ + auto end2 = end[2]; \ + auto begin3 = begin[3]; \ + auto end3 = end[3]; \ + auto begin4 = begin[4]; \ + auto end4 = end[4]; \ + auto begin5 = begin[5]; \ + auto end5 = end[5]; \ + auto dim5 = end5 - begin5; \ + auto dim4 = end4 - begin4; \ + auto dim3 = end3 - begin3; \ + auto dim2 = end2 - begin2; \ + auto dim1 = end1 - begin1; \ + auto dim0 = end0 - begin0; \ + auto nIter = dim5 * dim4 * dim3 * dim2 * dim1 * dim0; \ + /* clang-format off */ \ + KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector reduction(OPERATOR:val) copyin(functor) async(async_arg)) \ + /* clang-format on */ \ + for (decltype(nIter) m = 0; m < nIter; ++m) { \ + auto tmp1 = dim5 * dim4 * dim3 * dim2 * dim1; \ + auto i0 = m / tmp1 + begin0; \ + auto tmp2 = m % tmp1; \ + tmp1 = dim5 * dim4 * dim3 * dim2; \ + auto i1 = tmp2 / tmp1 + begin1; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4 * dim3; \ + auto i2 = tmp2 / tmp1 + begin2; \ + tmp2 = tmp2 % tmp1; \ + tmp1 = dim5 * dim4; \ + auto i3 = tmp2 / tmp1 + begin3; \ + tmp2 = tmp2 % tmp1; \ + auto i4 = tmp2 / dim5 + begin4; \ + auto i5 = tmp2 % dim5 + begin5; \ + functor(i0, i1, i2, i3, i4, i5, val); \ + } \ + acc_wait(async_arg); \ + aval = val; \ + } \ + } // namespace Kokkos::Experimental::Impl + +#else + #define KOKKOS_IMPL_OPENACC_PARALLEL_REDUCE_DISPATCH_ITERATE(REDUCER, \ OPERATOR) \ namespace Kokkos::Experimental::Impl { \ @@ -124,10 +522,10 @@ class Kokkos::Impl::ParallelReduce \ diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp index 430bdcb680..d4cb73164d 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelReduce_Team.hpp @@ -163,13 +163,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + wrapped_reducer.final(&tmp); result = tmp; } } @@ -180,15 +191,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() / loop_boundaries.team.vector_length(); if (j_start == 0) { #pragma acc loop seq for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -200,7 +221,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -208,6 +239,7 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -218,9 +250,17 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - using ValueType = typename ReducerType::value_type; - ValueType tmp; - reducer.init(tmp); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -228,6 +268,8 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); reducer.reference() = tmp; } } @@ -239,7 +281,17 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + iType j_start = loop_boundaries.team.team_rank() % loop_boundaries.team.vector_length(); if (j_start == 0) { @@ -247,6 +299,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } } @@ -273,10 +326,23 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + ValueType tmp = ValueType(); #pragma acc loop worker reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) lambda(i, tmp); + + wrapped_reducer.final(&tmp); result = tmp; } @@ -314,11 +380,22 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::OpenACCTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + wrapped_reducer.final(&tmp); result = tmp; } @@ -357,11 +434,23 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( const Impl::TeamVectorRangeBoundariesStruct& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType tmp = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type tmp; + wrapped_reducer.init(&tmp); + #pragma acc loop vector reduction(+ : tmp) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, tmp); } + + wrapped_reducer.final(&tmp); result = tmp; } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp index c6d3267bdb..b1c48baa1e 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_ParallelScan_Range.hpp @@ -225,7 +225,7 @@ KOKKOS_IMPL_ACC_PRAGMA(parallel loop gang vector_length(chunk_size) KOKKOS_IMPL_ } #pragma acc exit data delete (functor, chunk_values, offset_values, \ - final_reducer)async(async_arg) + final_reducer)async(async_arg) acc_wait(async_arg); } diff --git a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp index faa50aa7c3..95526aa784 100644 --- a/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp +++ b/lib/kokkos/core/src/OpenACC/Kokkos_OpenACC_Traits.hpp @@ -28,8 +28,11 @@ struct OpenACC_Traits { #elif defined(KOKKOS_ARCH_AMD_GPU) static constexpr acc_device_t dev_type = acc_device_radeon; static constexpr bool may_fallback_to_host = false; +#elif defined(KOKKOS_ENABLE_OPENACC_FORCE_HOST_AS_DEVICE) + static constexpr acc_device_t dev_type = acc_device_host; + static constexpr bool may_fallback_to_host = true; #else - static constexpr acc_device_t dev_type = acc_device_not_host; + static constexpr acc_device_t dev_type = acc_device_default; static constexpr bool may_fallback_to_host = true; #endif }; diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp index a403909f67..aa4be87ceb 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP.hpp @@ -30,7 +30,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -93,11 +92,16 @@ class OpenMP { void fence(std::string const& name = "Kokkos::OpenMP::fence: Unnamed Instance Fence") const; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 /// \brief Does the given instance return immediately after launching /// a parallel algorithm /// /// This always returns false on OpenMP - inline static bool is_asynchronous(OpenMP const& = OpenMP()) noexcept; + KOKKOS_DEPRECATED inline static bool is_asynchronous( + OpenMP const& = OpenMP()) noexcept { + return false; + } +#endif #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 static int concurrency(OpenMP const& = OpenMP()); @@ -154,10 +158,6 @@ inline int OpenMP::impl_thread_pool_rank() noexcept { KOKKOS_IF_ON_DEVICE((return -1;)) } -inline bool OpenMP::is_asynchronous(OpenMP const& /*instance*/) noexcept { - return false; -} - inline int OpenMP::impl_thread_pool_size(int depth) const { return depth < 2 ? impl_thread_pool_size() : 1; } @@ -202,7 +202,9 @@ struct MemorySpaceAccess #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp index 2877d940fa..6edcbff0c2 100644 --- a/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp +++ b/lib/kokkos/core/src/OpenMP/Kokkos_OpenMP_Task.hpp @@ -26,12 +26,19 @@ #include #include +#include + #include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -360,6 +367,10 @@ extern template class TaskQueue #include #include -#include #include #include #include @@ -148,7 +147,6 @@ struct DeviceTypeTraits<::Kokkos::Experimental::OpenMPTarget> { #include #include #include -#include /*--------------------------------------------------------------------------*/ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp index ed625cfcc8..ec33d25b96 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTargetSpace.hpp @@ -28,6 +28,7 @@ static_assert(false, #include #include +#include #ifdef KOKKOS_ENABLE_OPENMPTARGET @@ -91,9 +92,9 @@ class OpenMPTargetSpace { /**\brief Default memory space instance */ OpenMPTargetSpace(); - OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; - OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; - OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; + OpenMPTargetSpace(OpenMPTargetSpace&& rhs) = default; + OpenMPTargetSpace(const OpenMPTargetSpace& rhs) = default; + OpenMPTargetSpace& operator=(OpenMPTargetSpace&&) = default; OpenMPTargetSpace& operator=(const OpenMPTargetSpace&) = default; ~OpenMPTargetSpace() = default; @@ -141,79 +142,5 @@ class OpenMPTargetSpace { KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( Kokkos::Experimental::OpenMPTargetSpace); -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -// TODO: implement all possible deep_copies -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - // In the Release and RelWithDebInfo builds, the size of the memcpy should - // be greater than zero to avoid error. omp_target_memcpy returns zero on - // success. - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence " - "before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_default_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_default_device(), - omp_get_initial_device())); - } -}; - -template -struct DeepCopy { - DeepCopy(void* dst, const void* src, size_t n) { - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } - DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { - exec.fence( - "Kokkos::Impl::DeepCopy: fence before " - "copy"); - if (n > 0) - KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( - dst, const_cast(src), n, 0, 0, omp_get_initial_device(), - omp_get_default_device())); - } -}; - -} // namespace Impl -} // namespace Kokkos - #endif #endif /* #define KOKKOS_OPENMPTARGETSPACE_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp new file mode 100644 index 0000000000..aace09e266 --- /dev/null +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_DeepCopy.hpp @@ -0,0 +1,101 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_OPENMPTARGET_DEEP_COPY_HPP +#define KOKKOS_OPENMPTARGET_DEEP_COPY_HPP + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +// TODO: implement all possible deep_copies +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + // In the Release and RelWithDebInfo builds, the size of the memcpy should + // be greater than zero to avoid error. omp_target_memcpy returns zero on + // success. + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence " + "before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_default_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_default_device(), + omp_get_initial_device())); + } +}; + +template +struct DeepCopy { + DeepCopy(void* dst, const void* src, size_t n) { + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } + DeepCopy(const ExecutionSpace& exec, void* dst, const void* src, size_t n) { + exec.fence( + "Kokkos::Impl::DeepCopy: fence before " + "copy"); + if (n > 0) + KOKKOS_IMPL_OMPT_SAFE_CALL(omp_target_memcpy( + dst, const_cast(src), n, 0, 0, omp_get_initial_device(), + omp_get_default_device())); + } +}; + +} // namespace Impl +} // namespace Kokkos + +#endif // KOKKOS_OPENMPTARGET_DEEP_COPY_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp deleted file mode 100644 index 6c5eb048e3..0000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Exec.cpp +++ /dev/null @@ -1,130 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#ifdef KOKKOS_ENABLE_OPENMPTARGET - -// FIXME_OPENMPTARGET currently unused -/* -namespace Kokkos { -namespace Impl { -namespace { - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel(); - -KOKKOS_INLINE_FUNCTION -int kokkos_omp_in_parallel() { return omp_in_parallel(); } - -bool s_using_hwloc = false; - -} // namespace -} // namespace Impl -} // namespace Kokkos -*/ - -namespace Kokkos { -namespace Impl { - -void OpenMPTargetExec::verify_is_process(const char* const label) { - // Fails if the current task is in a parallel region or is not on the host. - if (omp_in_parallel() && (!omp_is_initial_device())) { - std::string msg(label); - msg.append(" ERROR: in parallel or on device"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void OpenMPTargetExec::verify_initialized(const char* const label) { - if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { - std::string msg(label); - msg.append(" ERROR: not initialized"); - Kokkos::Impl::throw_runtime_exception(msg); - } -} - -void* OpenMPTargetExec::m_scratch_ptr = nullptr; -int64_t OpenMPTargetExec::m_scratch_size = 0; -uint32_t* OpenMPTargetExec::m_uniquetoken_ptr = nullptr; -int OpenMPTargetExec::MAX_ACTIVE_THREADS = 0; -std::mutex OpenMPTargetExec::m_mutex_scratch_ptr; - -void OpenMPTargetExec::clear_scratch() { - Kokkos::Experimental::OpenMPTargetSpace space; - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_ptr = nullptr; - m_scratch_size = 0; -} - -void* OpenMPTargetExec::get_scratch_ptr() { return m_scratch_ptr; } - -void OpenMPTargetExec::resize_scratch(int64_t team_size, int64_t shmem_size_L0, - int64_t shmem_size_L1, - int64_t league_size) { - Kokkos::Experimental::OpenMPTargetSpace space; - // Level-0 scratch when using clang/17 and higher comes from their OpenMP - // extension, `ompx_dyn_cgroup_mem`. -#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) - shmem_size_L0 = 0; -#endif - const int64_t shmem_size = - shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. - const int64_t padding = shmem_size * 10 / 100; // Padding per team. - - // Maximum active teams possible. - // The number should not exceed the maximum in-flight teams possible or the - // league_size. - int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); - - // max_active_teams is the number of active teams on the given hardware. - // We set the number of teams to be twice the number of max_active_teams for - // the compiler to pick the right number in its case. - // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. -#if !defined(KOKKOS_COMPILER_CRAY_LLVM) - omp_set_num_teams(max_active_teams * 2); -#endif - - // Total amount of scratch memory allocated is depenedent - // on the maximum number of in-flight teams possible. - int64_t total_size = - (shmem_size + OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * - max_active_teams * 2; - - if (total_size > m_scratch_size) { - space.deallocate(m_scratch_ptr, m_scratch_size); - m_scratch_size = total_size; - m_scratch_ptr = space.allocate(total_size); - } -} - -} // namespace Impl -} // namespace Kokkos - -#endif // KOKKOS_ENABLE_OPENMPTARGET diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp new file mode 100644 index 0000000000..13b509c0ad --- /dev/null +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_FunctorAdapter.hpp @@ -0,0 +1,48 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP +#define KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP + +#include +#include + +namespace Kokkos::Experimental::Impl { + +template +class FunctorAdapter { + Functor m_functor; + using WorkTag = typename Policy::work_tag; + + public: + FunctorAdapter() = default; + FunctorAdapter(Functor const &functor) : m_functor(functor) {} + + Functor get_functor() const { return m_functor; } + + template + KOKKOS_FUNCTION void operator()(Args &&...args) const { + if constexpr (std::is_void_v) { + m_functor(static_cast(args)...); + } else { + m_functor(WorkTag(), static_cast(args)...); + } + } +}; + +} // namespace Kokkos::Experimental::Impl + +#endif // KOKKOS_OPENMPTARGET_FUNCTOR_ADAPTER_HPP diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp index 44e9119ea8..53e723882f 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Instance.cpp @@ -27,11 +27,11 @@ // constructor. undef'ed at the end #define KOKKOS_IMPL_OPENMPTARGET_WORKAROUND +#include #include #include #include #include -#include #include @@ -105,18 +105,15 @@ void OpenMPTargetInternal::print_configuration(std::ostream& os, void OpenMPTargetInternal::impl_finalize() { m_is_initialized = false; - Kokkos::Impl::OpenMPTargetExec space; - if (space.m_uniquetoken_ptr != nullptr) + if (m_uniquetoken_ptr != nullptr) Kokkos::kokkos_free( - space.m_uniquetoken_ptr); + m_uniquetoken_ptr); } void OpenMPTargetInternal::impl_initialize() { m_is_initialized = true; - Kokkos::Impl::OpenMPTargetExec::MAX_ACTIVE_THREADS = concurrency(); - // FIXME_OPENMPTARGET: Only fix the number of teams for NVIDIA architectures // from Pascal and upwards. // FIXME_OPENMPTARGTE: Cray compiler did not yet implement omp_set_num_teams. @@ -136,7 +133,75 @@ OpenMPTargetInternal* OpenMPTargetInternal::impl_singleton() { return &self; } -} // Namespace Impl +void OpenMPTargetInternal::verify_is_process(const char* const label) { + // Fails if the current task is in a parallel region or is not on the host. + if (omp_in_parallel() && (!omp_is_initial_device())) { + std::string msg(label); + msg.append(" ERROR: in parallel or on device"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::verify_initialized(const char* const label) { + if (0 == Kokkos::Experimental::OpenMPTarget().impl_is_initialized()) { + std::string msg(label); + msg.append(" ERROR: not initialized"); + Kokkos::Impl::throw_runtime_exception(msg); + } +} + +void OpenMPTargetInternal::clear_scratch() { + Kokkos::Experimental::OpenMPTargetSpace space; + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_ptr = nullptr; + m_scratch_size = 0; +} + +void* OpenMPTargetInternal::get_scratch_ptr() { return m_scratch_ptr; } + +void OpenMPTargetInternal::resize_scratch(int64_t team_size, + int64_t shmem_size_L0, + int64_t shmem_size_L1, + int64_t league_size) { + Kokkos::Experimental::OpenMPTargetSpace space; + // Level-0 scratch when using clang/17 and higher comes from their OpenMP + // extension, `ompx_dyn_cgroup_mem`. +#if defined(KOKKOS_IMPL_OPENMPTARGET_LLVM_EXTENSIONS) + shmem_size_L0 = 0; +#endif + const int64_t shmem_size = + shmem_size_L0 + shmem_size_L1; // L0 + L1 scratch memory per team. + const int64_t padding = shmem_size * 10 / 100; // Padding per team. + + // Maximum active teams possible. + // The number should not exceed the maximum in-flight teams possible or the + // league_size. + int max_active_teams = + std::min(OpenMPTargetInternal::concurrency() / team_size, league_size); + + // max_active_teams is the number of active teams on the given hardware. + // We set the number of teams to be twice the number of max_active_teams for + // the compiler to pick the right number in its case. + // FIXME_OPENMPTARGET: Cray compiler did not yet implement omp_set_num_teams. +#if !defined(KOKKOS_COMPILER_CRAY_LLVM) + omp_set_num_teams(max_active_teams * 2); +#endif + + // Total amount of scratch memory allocated is depenedent + // on the maximum number of in-flight teams possible. + int64_t total_size = + (shmem_size + + ::Kokkos::Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE + padding) * + max_active_teams * 2; + + if (total_size > m_scratch_size) { + space.deallocate(m_scratch_ptr, m_scratch_size); + m_scratch_size = total_size; + m_scratch_ptr = space.allocate(total_size); + } +} + +} // namespace Impl OpenMPTarget::OpenMPTarget() : m_space_instance(Impl::OpenMPTargetInternal::impl_singleton()) {} @@ -206,9 +271,9 @@ namespace Experimental { UniqueToken:: - UniqueToken(Kokkos::Experimental::OpenMPTarget const&) { + UniqueToken(Kokkos::Experimental::OpenMPTarget const& space) { #ifdef KOKKOS_IMPL_OPENMPTARGET_WORKAROUND - uint32_t* ptr = Kokkos::Impl::OpenMPTargetExec::m_uniquetoken_ptr; + uint32_t* ptr = space.impl_internal_space_instance()->m_uniquetoken_ptr; int count = Kokkos::Experimental::OpenMPTarget().concurrency(); if (ptr == nullptr) { int size = count * sizeof(uint32_t); @@ -221,7 +286,7 @@ UniqueTokenm_uniquetoken_ptr = ptr; } #else // FIXME_OPENMPTARGET - 2 versions of non-working implementations to fill `ptr` @@ -229,8 +294,7 @@ UniqueToken - namespace Kokkos { namespace Experimental { namespace Impl { @@ -27,9 +25,9 @@ enum class openmp_fence_is_static { yes, no }; class OpenMPTargetInternal { private: - OpenMPTargetInternal() = default; - OpenMPTargetInternal(const OpenMPTargetInternal&) = default; - OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = default; + OpenMPTargetInternal() = default; + OpenMPTargetInternal(const OpenMPTargetInternal&) = delete; + OpenMPTargetInternal& operator=(const OpenMPTargetInternal&) = delete; public: void fence(openmp_fence_is_static is_static = openmp_fence_is_static::no); @@ -55,6 +53,19 @@ class OpenMPTargetInternal { static OpenMPTargetInternal* impl_singleton(); + static void verify_is_process(const char* const); + static void verify_initialized(const char* const); + + void* get_scratch_ptr(); + void clear_scratch(); + void resize_scratch(int64_t team_reduce_bytes, int64_t team_shared_bytes, + int64_t thread_local_bytes, int64_t league_size); + + void* m_scratch_ptr = nullptr; + std::mutex m_mutex_scratch_ptr; + int64_t m_scratch_size = 0; + uint32_t* m_uniquetoken_ptr = nullptr; + private: bool m_is_initialized = false; uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp index e222d65250..f71f888713 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Parallel.hpp @@ -116,8 +116,8 @@ class OpenMPTargetExecTeamMember { // FIXME_OPENMPTARGET this function currently ignores the reducer passed. template KOKKOS_INLINE_FUNCTION std::enable_if_t::value> - team_reduce(ReducerType const&, typename ReducerType::value_type& value) const - noexcept { + team_reduce(ReducerType const&, + typename ReducerType::value_type& value) const noexcept { #pragma omp barrier using value_type = typename ReducerType::value_type; @@ -741,43 +741,6 @@ struct TeamVectorRangeBoundariesStruct { } // namespace Impl -} // namespace Kokkos -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- -namespace Kokkos { -namespace Impl { - -//---------------------------------------------------------------------------- -/** \brief Data for OpenMPTarget thread execution */ - -class OpenMPTargetExec { - public: - // FIXME_OPENMPTARGET - Currently the maximum number of - // teams possible is calculated based on NVIDIA's Volta GPU. In - // future this value should be based on the chosen architecture for the - // OpenMPTarget backend. - static int MAX_ACTIVE_THREADS; - - private: - static void* scratch_ptr; - - public: - static void verify_is_process(const char* const); - static void verify_initialized(const char* const); - - static void* get_scratch_ptr(); - static void clear_scratch(); - static void resize_scratch(int64_t team_reduce_bytes, - int64_t team_shared_bytes, - int64_t thread_local_bytes, int64_t league_size); - - static void* m_scratch_ptr; - static std::mutex m_mutex_scratch_ptr; - static int64_t m_scratch_size; - static uint32_t* m_uniquetoken_ptr; -}; - -} // namespace Impl } // namespace Kokkos #endif /* KOKKOS_OPENMPTARGET_PARALLEL_HPP */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp index bd7d3eef5d..38ed7c5681 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_MDRange.hpp @@ -20,6 +20,8 @@ #include #include #include "Kokkos_OpenMPTarget_MDRangePolicy.hpp" +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- @@ -31,38 +33,38 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::MDRangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; - using Index = typename Policy::index_type; + using Policy = Kokkos::MDRangePolicy; + using Member = typename Policy::member_type; + using Index = typename Policy::index_type; + + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + const FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; public: inline void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - FunctorType functor(m_functor); + Policy policy = m_policy; - typename Policy::point_type unused; static_assert(1 < Policy::rank && Policy::rank < 7); static_assert(Policy::inner_direction == Iterate::Left || Policy::inner_direction == Iterate::Right); execute_tile( - unused, functor, policy, + m_functor, policy, std::integral_constant()); } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -72,18 +74,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i0 = begin_0; i0 < end_0; ++i0) for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -96,10 +94,7 @@ class ParallelFor, for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -107,9 +102,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -125,10 +119,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -137,9 +128,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -158,11 +148,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -172,9 +158,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateRight) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -197,12 +182,7 @@ class ParallelFor, for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } @@ -214,9 +194,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -226,18 +205,14 @@ class ParallelFor, #pragma omp target teams distribute parallel for collapse(2) map(to : functor) for (auto i1 = begin_1; i1 < end_1; ++i1) for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1); - else - functor(typename Policy::work_tag(), i0, i1); + functor(i0, i1); } } template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -250,10 +225,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2); - else - functor(typename Policy::work_tag(), i0, i1, i2); + functor(i0, i1, i2); } } } @@ -261,9 +233,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -279,10 +250,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, i3); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3); + functor(i0, i1, i2, i3); } } } @@ -291,9 +259,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -312,11 +279,7 @@ class ParallelFor, for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4); + functor(i0, i1, i2, i3, i4); } } } @@ -326,9 +289,8 @@ class ParallelFor, template inline std::enable_if_t execute_tile( - typename Policy::point_type offset, const FunctorType& functor, - const Policy& policy, OpenMPTargetIterateLeft) const { - (void)offset; + const FunctorAdapter& functor, const Policy& policy, + OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; const Index begin_2 = policy.m_lower[2]; @@ -351,12 +313,7 @@ class ParallelFor, for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - i5); + functor(i0, i1, i2, i3, i4, i5); } } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp index a674637a3b..502461cc5e 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Range.hpp @@ -20,6 +20,8 @@ #include #include #include +#include "Kokkos_OpenMPTarget_Instance.hpp" +#include "Kokkos_OpenMPTarget_FunctorAdapter.hpp" namespace Kokkos { namespace Impl { @@ -28,36 +30,30 @@ template class ParallelFor, Kokkos::Experimental::OpenMPTarget> { private: - using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Policy = Kokkos::RangePolicy; + using Member = typename Policy::member_type; - const FunctorType m_functor; + Kokkos::Experimental::Impl::FunctorAdapter m_functor; const Policy m_policy; public: - void execute() const { execute_impl(); } + void execute() const { execute_impl(); } - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto begin = m_policy.begin(); const auto end = m_policy.end(); if (end <= begin) return; - FunctorType a_functor(m_functor); + auto const a_functor(m_functor); #pragma omp target teams distribute parallel for map(to : a_functor) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void::value) { - a_functor(i); - } else { - a_functor(TagType(), i); - } + a_functor(i); } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp index 26085f1140..77dc71a87b 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelFor_Team.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace Kokkos { @@ -76,28 +77,27 @@ class ParallelFor, using Policy = Kokkos::Impl::TeamPolicyInternal; - using WorkTag = typename Policy::work_tag; - using Member = typename Policy::member_type; + using Member = typename Policy::member_type; + + Kokkos::Experimental::Impl::FunctorAdapter m_functor; - const FunctorType m_functor; const Policy m_policy; const size_t m_shmem_size; public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); - execute_impl(); + execute_impl(); } private: - template void execute_impl() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const auto league_size = m_policy.league_size(); const auto team_size = m_policy.team_size(); @@ -105,11 +105,12 @@ class ParallelFor, const size_t shmem_size_L0 = m_policy.scratch_size(0, team_size); const size_t shmem_size_L1 = m_policy.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(team_size, shmem_size_L0, shmem_size_L1, - league_size); + m_policy.space().impl_internal_space_instance()->resize_scratch( + team_size, shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - FunctorType a_functor(m_functor); + void* scratch_ptr = + m_policy.space().impl_internal_space_instance()->get_scratch_ptr(); + auto const a_functor(m_functor); // FIXME_OPENMPTARGET - If the team_size is not a multiple of 32, the // scratch implementation does not work in the Release or RelWithDebugInfo @@ -122,7 +123,7 @@ class ParallelFor, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(m_policy.space().concurrency() / team_size, league_size); #endif // FIXME_OPENMPTARGET: Although the maximum number of teams is set using the @@ -161,16 +162,13 @@ class ParallelFor, typename Policy::member_type team(league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #else #pragma omp target teams distribute firstprivate(a_functor) \ is_device_ptr(scratch_ptr) num_teams(max_active_teams) \ - thread_limit(team_size) + thread_limit(team_size) for (int i = 0; i < league_size; i++) { #pragma omp parallel { @@ -180,10 +178,7 @@ class ParallelFor, typename Policy::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - m_functor(team); - else - m_functor(TagType(), team); + a_functor(team); } } #endif diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp index e86a121974..bee604834c 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_MDRange.hpp @@ -37,9 +37,8 @@ class ParallelReduce; + public: inline void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = FunctorAdapter(m_functor_reducer.get_functor()); execute_tile( - m_functor_reducer.get_functor(), m_policy, m_result_ptr, + functor, m_policy, m_result_ptr, std::integral_constant()); } @@ -77,7 +81,7 @@ class ParallelReduce inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -90,32 +94,23 @@ class ParallelReduce::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -126,7 +121,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -141,38 +136,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -184,7 +170,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -201,40 +187,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -247,7 +222,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -266,26 +241,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -293,18 +260,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -318,7 +280,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateLeft) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -339,27 +301,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -368,19 +322,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i5 = begin_5; i5 < end_5; ++i5) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i0 = begin_0; i0 < end_0; ++i0) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -395,7 +344,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -408,32 +357,23 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(2) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } else { #pragma omp target teams distribute parallel for collapse(2) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { - if constexpr (std::is_void::value) - functor(i0, i1, result); - else - functor(typename Policy::work_tag(), i0, i1, result); + functor(i0, i1, result); } } } @@ -444,7 +384,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -459,38 +399,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join( \ - omp_out, omp_in)) \ - initializer( \ - OpenMPTargetReducerWrapper ::init( \ - omp_priv)) +#pragma omp declare reduction( \ + custom \ +:ValueType : OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper< \ + typename ReducerType::functor_type>::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(3) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } } else { #pragma omp target teams distribute parallel for collapse(3) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { - if constexpr (std::is_void::value) - functor(i0, i1, i2, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, result); + functor(i0, i1, i2, result); } } } @@ -502,7 +433,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -519,40 +450,29 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(4) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } } } else { #pragma omp target teams distribute parallel for collapse(4) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, result); + functor(i0, i1, i2, i3, result); } } } @@ -565,7 +485,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -584,26 +504,18 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(5) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -611,18 +523,13 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(5) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, - result); + functor(i0, i1, i2, i3, i4, result); } } } @@ -636,7 +543,7 @@ reduction(+:result) template inline std::enable_if_t execute_tile( - const FunctorType& functor, const Policy& policy, pointer_type ptr, + const FunctorAdapter& functor, const Policy& policy, pointer_type ptr, OpenMPTargetIterateRight) const { const Index begin_0 = policy.m_lower[0]; const Index begin_1 = policy.m_lower[1]; @@ -657,27 +564,19 @@ reduction(+:result) // FIXME_OPENMPTARGET: Unable to separate directives and their companion // loops which leads to code duplication for different reduction types. if constexpr (UseReducer) { -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for collapse(6) map(to \ - : functor) \ - reduction(custom \ - : result) +#pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ + reduction(custom : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } @@ -686,19 +585,14 @@ reduction(+:result) } } else { #pragma omp target teams distribute parallel for collapse(6) map(to : functor) \ -reduction(+:result) + reduction(+ : result) for (auto i0 = begin_0; i0 < end_0; ++i0) { for (auto i1 = begin_1; i1 < end_1; ++i1) { for (auto i2 = begin_2; i2 < end_2; ++i2) { for (auto i3 = begin_3; i3 < end_3; ++i3) { for (auto i4 = begin_4; i4 < end_4; ++i4) { for (auto i5 = begin_5; i5 < end_5; ++i5) { - if constexpr (std::is_same::value) - functor(i0, i1, i2, i3, i4, i5, result); - else - functor(typename Policy::work_tag(), i0, i1, i2, i3, i4, i5, - result); + functor(i0, i1, i2, i3, i4, i5, result); } } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp index 4a112ed11d..b7c8abcb44 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Range.hpp @@ -21,6 +21,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -33,8 +34,6 @@ class ParallelReduce, using FunctorType = typename CombinedFunctorReducerType::functor_type; using ReducerType = typename CombinedFunctorReducerType::reducer_type; - using WorkTag = typename Policy::work_tag; - using pointer_type = typename ReducerType::pointer_type; using reference_type = typename ReducerType::reference_type; @@ -55,14 +54,17 @@ class ParallelReduce, const pointer_type m_result_ptr; bool m_result_ptr_on_device; const int m_result_ptr_num_elems; - using TagType = typename Policy::work_tag; public: void execute() const { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); + if constexpr (FunctorHasJoin) { // Enter this loop if the Functor has a init-join. ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, @@ -75,26 +77,26 @@ class ParallelReduce, // Enter this loop if the reduction is on an array and the routine is // templated over the size of the array. if (m_result_ptr_num_elems <= 2) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<2>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 4) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<4>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 8) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<8>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 16) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<16>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else if (m_result_ptr_num_elems <= 32) { - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<32>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } else { Kokkos::abort("array reduction length must be <= 32"); } } else { // This loop handles the basic scalar reduction. - ParReduceSpecialize::template execute_array( + ParReduceSpecialize::template execute_array<1>( functor, m_policy, m_result_ptr, m_result_ptr_on_device); } } diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp index 16c0eedb81..b81e3aa7ed 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelReduce_Team.hpp @@ -59,7 +59,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for reduction(+ : TeamThread_scratch[:1]) +#pragma omp for reduction(+ : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -68,7 +68,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for reduction(custom : TeamThread_scratch[:1]) +#pragma omp for reduction(custom : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -90,11 +90,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< const Lambda& lambda, ReducerType result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custominner:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custominner \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) // FIXME_OPENMPTARGET - Make sure that if its an array reduction, number of // elements in the array <= 32. For reduction we allocate, 16 bytes per @@ -109,7 +108,7 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamThread_scratch[0]); #pragma omp barrier -#pragma omp for reduction(custominner : TeamThread_scratch[:1]) +#pragma omp for reduction(custominner : TeamThread_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamThread_scratch[0]); } @@ -132,11 +131,10 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< ValueType* TeamThread_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -145,8 +143,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for reduction(omp_red_teamthread_reducer \ - : TeamThread_scratch[:1]) schedule(static, 1) +#pragma omp for reduction( \ + omp_red_teamthread_reducer : TeamThread_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -259,11 +258,10 @@ parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< const Lambda& lambda, ReducerType const& result) { using ValueType = typename ReducerType::value_type; -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType vector_reduce; Impl::OpenMPTargetReducerWrapper::init(vector_reduce); @@ -329,7 +327,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( #pragma omp barrier if constexpr (std::is_arithmetic::value) { -#pragma omp for simd reduction(+ : TeamVector_scratch[:1]) +#pragma omp for simd reduction(+ : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -338,7 +336,7 @@ KOKKOS_INLINE_FUNCTION void parallel_reduce( } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { ValueType tmp = ValueType(); lambda(i, tmp); @@ -363,11 +361,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< static_assert(sizeof(ValueType) <= Impl::OpenMPTargetExecTeamMember::TEAM_REDUCE_SIZE); -#pragma omp declare reduction( \ - custom:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); @@ -376,7 +373,7 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< Impl::OpenMPTargetReducerWrapper::init(TeamVector_scratch[0]); #pragma omp barrier -#pragma omp for simd reduction(custom : TeamVector_scratch[:1]) +#pragma omp for simd reduction(custom : TeamVector_scratch[ : 1]) for (iType i = loop_boundaries.start; i < loop_boundaries.end; i++) { lambda(i, TeamVector_scratch[0]); } @@ -400,11 +397,10 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< ValueType* TeamVector_scratch = static_cast(loop_boundaries.team.impl_reduce_scratch()); -#pragma omp declare reduction( \ - omp_red_teamthread_reducer:ValueType \ - : Impl::OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer( \ - Impl::OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(omp_red_teamthread_reducer \ +:ValueType : Impl::OpenMPTargetReducerWrapper::join(omp_out, \ + omp_in)) \ + initializer(Impl::OpenMPTargetReducerWrapper::init(omp_priv)) #pragma omp barrier ValueType tmp; @@ -413,8 +409,9 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< #pragma omp barrier iType team_size = iType(omp_get_num_threads()); -#pragma omp for simd reduction(omp_red_teamthread_reducer \ - : TeamVector_scratch[:1]) schedule(static, 1) +#pragma omp for simd reduction( \ + omp_red_teamthread_reducer : TeamVector_scratch[ : 1]) \ + schedule(static, 1) for (iType t = 0; t < team_size; t++) { ValueType tmp2; result.init(tmp2); @@ -443,8 +440,7 @@ class ParallelReduce scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); - const FunctorType& functor = m_functor_reducer.get_functor(); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); + auto const functor = + Kokkos::Experimental::Impl::FunctorAdapter( + m_functor_reducer.get_functor()); if constexpr (FunctorHasJoin) { ParReduceSpecialize::execute_init_join(functor, m_policy, m_result_ptr, m_result_ptr_on_device); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp index 29df0163c8..ec8a96cb2f 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_ParallelScan_Range.hpp @@ -20,6 +20,7 @@ #include #include #include +#include namespace Kokkos { namespace Impl { @@ -30,7 +31,6 @@ class ParallelScan, protected: using Policy = Kokkos::RangePolicy; - using WorkTag = typename Policy::work_tag; using Member = typename Policy::member_type; using idx_type = typename Policy::index_type; @@ -48,18 +48,8 @@ class ParallelScan, value_type* m_result_ptr; const bool m_result_ptr_device_accessible; - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(idx, val, is_final); - } - template - std::enable_if_t::value> call_with_tag( - const FunctorType& f, const idx_type& idx, value_type& val, - const bool& is_final) const { - f(WorkTag(), idx, val, is_final); - } + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; public: void impl_execute( @@ -77,8 +67,10 @@ class ParallelScan, idx_type team_size = 128; auto a_functor_reducer = m_functor_reducer; -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) + auto a_functor = FunctorAdapter(m_functor_reducer.get_functor()); + +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -91,9 +83,8 @@ class ParallelScan, const idx_type idx = local_offset + i; value_type val; reducer.init(&val); - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, val, - false); + if (idx < N) a_functor(idx, val, false); + element_values(team_id, i) = val; } #pragma omp barrier @@ -120,9 +111,8 @@ class ParallelScan, } } -#pragma omp target teams distribute map(to \ - : a_functor_reducer) num_teams(nteams) \ - thread_limit(team_size) +#pragma omp target teams distribute map(to : a_functor_reducer, a_functor) \ + num_teams(nteams) thread_limit(team_size) for (idx_type team_id = 0; team_id < n_chunks; ++team_id) { const typename Analysis::Reducer& reducer = a_functor_reducer.get_reducer(); @@ -145,12 +135,7 @@ class ParallelScan, #if defined(KOKKOS_ARCH_AMD_GPU) && !defined(KOKKOS_ARCH_AMD_GFX1030) && \ !defined(KOKKOS_ARCH_AMD_GFX1100) && !defined(KOKKOS_ARCH_AMD_GFX1103) if constexpr (Analysis::Reducer::has_join_member_function()) { - if constexpr (std::is_void_v) - a_functor_reducer.get_functor().join(local_offset_value, - offset_value); - else - a_functor_reducer.get_functor().join( - WorkTag{}, local_offset_value, offset_value); + a_functor.get_functor().join(local_offset_value, offset_value); } else local_offset_value += offset_value; #else @@ -158,9 +143,8 @@ class ParallelScan, #endif } else local_offset_value = offset_value; - if (idx < N) - call_with_tag(a_functor_reducer.get_functor(), idx, - local_offset_value, true); + if (idx < N) a_functor(idx, local_offset_value, true); + if (idx == N - 1 && m_result_ptr_device_accessible) *m_result_ptr = local_offset_value; } @@ -169,9 +153,9 @@ class ParallelScan, } void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const idx_type N = m_policy.end() - m_policy.begin(); const idx_type chunk_size = 128; @@ -179,7 +163,7 @@ class ParallelScan, // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + m_policy.space().impl_internal_space_instance()->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View, public: void execute() const { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget parallel_for"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget parallel_for"); const int64_t N = base_t::m_policy.end() - base_t::m_policy.begin(); const int chunk_size = 128; @@ -231,7 +215,9 @@ class ParallelScanWithTotal, if (N > 0) { // Only let one ParallelReduce instance at a time use the scratch memory. std::scoped_lock scratch_memory_lock( - OpenMPTargetExec::m_mutex_scratch_ptr); + base_t::m_policy.space() + .impl_internal_space_instance() + ->m_mutex_scratch_ptr); // This could be scratch memory per team Kokkos::View #include #include +#include namespace Kokkos { namespace Impl { @@ -72,7 +73,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = Kokkos::RangePolicy; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -82,12 +82,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:reducer"); const auto begin = p.begin(); @@ -104,33 +107,27 @@ struct ParallelReduceSpecialize, return; } -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), ptr_on_device); } - template - static void execute_array(const FunctorType& f, const PolicyType& p, + template + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:array_reduction"); const auto begin = p.begin(); @@ -150,27 +147,14 @@ struct ParallelReduceSpecialize, // Case where reduction is on a native data type. if constexpr (std::is_arithmetic::value) { -#pragma omp target teams distribute parallel for \ - map(to:f) reduction(+: result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result) + for (auto i = begin; i < end; ++i) f(i, result); } else { #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) -#pragma omp target teams distribute parallel for map(to \ - : f) reduction(custom \ - : result) - for (auto i = begin; i < end; ++i) - - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(custom : result) + for (auto i = begin; i < end; ++i) f(i, result); } ParReduceCopy::memcpy_result(result_ptr, &result, sizeof(ValueType), @@ -186,13 +170,10 @@ struct ParallelReduceSpecialize, ptr_on_device); return; } -#pragma omp target teams distribute parallel for map(to:f) reduction(+:result[:NumReductions]) +#pragma omp target teams distribute parallel for map(to : f) \ + reduction(+ : result[ : NumReductions]) for (auto i = begin; i < end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } ParReduceCopy::memcpy_result( @@ -200,12 +181,12 @@ struct ParallelReduceSpecialize, } } - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget RangePolicy " "parallel_reduce:init_join"); const auto begin = p.begin(); @@ -219,23 +200,25 @@ struct ParallelReduceSpecialize, const auto size = end - begin; - // FIXME_OPENMPTARGET: The team size and MAX_ACTIVE_THREADS are currently + // FIXME_OPENMPTARGET: The team size and concurrency are currently // based on NVIDIA-V100 and should be modifid to be based on the // architecture in the future. const int max_team_threads = 32; const int max_teams = - OpenMPTargetExec::MAX_ACTIVE_THREADS / max_team_threads; + p.space().impl_internal_space_instance()->concurrency() / + max_team_threads; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. Achieved by setting the first // parameter of `resize_scratch=1`. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - std::numeric_limits::max()); - ValueType* scratch_ptr = - static_cast(OpenMPTargetExec::get_scratch_ptr()); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), + std::numeric_limits::max()); + ValueType* scratch_ptr = static_cast( + p.space().impl_internal_space_instance()->get_scratch_ptr()); - typename FunctorAnalysis::Reducer final_reducer(f); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { #pragma omp target map(to : final_reducer) is_device_ptr(scratch_ptr) @@ -260,8 +243,7 @@ struct ParallelReduceSpecialize, } #pragma omp target teams num_teams(max_teams) thread_limit(max_team_threads) \ - map(to \ - : final_reducer) is_device_ptr(scratch_ptr) + map(to : final_reducer) is_device_ptr(scratch_ptr) { #pragma omp parallel { @@ -279,11 +261,7 @@ struct ParallelReduceSpecialize, // Accumulate partial results in thread specific storage. #pragma omp for simd for (auto i = team_begin; i < team_end; ++i) { - if constexpr (std::is_void_v) { - f(i, result); - } else { - f(TagType(), i, result); - } + f(i, result); } // Reduce all paritial results within a team. @@ -304,8 +282,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { -#pragma omp target teams distribute parallel for simd map(to \ - : f) \ +#pragma omp target teams distribute parallel for simd map(to : f) \ is_device_ptr(scratch_ptr) for (int i = 0; i < max_teams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { @@ -344,7 +321,6 @@ template , ReducerType, PointerType, ValueType> { using PolicyType = TeamPolicyInternal; - using TagType = typename PolicyType::work_tag; using ReducerTypeFwd = std::conditional_t::value, FunctorType, ReducerType>; @@ -355,12 +331,15 @@ struct ParallelReduceSpecialize, using ParReduceCopy = ParallelReduceCopy; - static void execute_reducer(const FunctorType& f, const PolicyType& p, + using FunctorAdapter = + Kokkos::Experimental::Impl::FunctorAdapter; + + static void execute_reducer(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:reducer"); @@ -370,9 +349,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); ValueType result = ValueType(); @@ -383,16 +364,15 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. if (max_active_teams <= 0) return; -#pragma omp declare reduction( \ - custom:ValueType \ - : OpenMPTargetReducerWrapper ::join(omp_out, omp_in)) \ - initializer(OpenMPTargetReducerWrapper ::init(omp_priv)) +#pragma omp declare reduction(custom \ +:ValueType : OpenMPTargetReducerWrapper::join(omp_out, omp_in)) \ + initializer(OpenMPTargetReducerWrapper::init(omp_priv)) #if !defined(KOKKOS_IMPL_OPENMPTARGET_HIERARCHICAL_INTEL_GPU) KOKKOS_IMPL_OMPTARGET_PRAGMA( @@ -414,16 +394,13 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #else #pragma omp target teams distribute firstprivate(f) is_device_ptr(scratch_ptr) \ - num_teams(max_active_teams) thread_limit(team_size) reduction(custom \ - : result) + num_teams(max_active_teams) thread_limit(team_size) \ + reduction(custom : result) for (int i = 0; i < league_size; i++) { #pragma omp parallel reduction(custom : result) { @@ -433,10 +410,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team(i, league_size, team_size, vector_length, scratch_ptr, i, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } #endif @@ -447,12 +421,12 @@ struct ParallelReduceSpecialize, } template - static void execute_array(const FunctorType& f, const PolicyType& p, + static void execute_array(const FunctorAdapter& f, const PolicyType& p, PointerType result_ptr, bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:array_reduction"); @@ -462,9 +436,11 @@ struct ParallelReduceSpecialize, const size_t shmem_size_L0 = p.scratch_size(0, team_size); const size_t shmem_size_L1 = p.scratch_size(1, team_size); - OpenMPTargetExec::resize_scratch(PolicyType::member_type::TEAM_REDUCE_SIZE, - shmem_size_L0, shmem_size_L1, league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); + p.space().impl_internal_space_instance()->resize_scratch( + PolicyType::member_type::TEAM_REDUCE_SIZE, shmem_size_L0, shmem_size_L1, + league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); // Maximum active teams possible. // FIXME_OPENMPTARGET: Cray compiler did not yet implement @@ -473,7 +449,7 @@ struct ParallelReduceSpecialize, int max_active_teams = omp_get_max_teams(); #else int max_active_teams = - std::min(OpenMPTargetExec::MAX_ACTIVE_THREADS / team_size, league_size); + std::min(p.space().concurrency() / team_size, league_size); #endif // If the league size is <=0, do not launch the kernel. @@ -504,19 +480,14 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } else { // Case where the reduction is on a non-native data type. #pragma omp declare reduction(custom:ValueType : omp_out += omp_in) #pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ - map(to \ - : f) is_device_ptr(scratch_ptr) reduction(custom \ - : result) + map(to : f) is_device_ptr(scratch_ptr) reduction(custom : result) #pragma omp parallel reduction(custom : result) { if (omp_get_num_teams() > max_active_teams) @@ -531,10 +502,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } } @@ -545,10 +513,10 @@ struct ParallelReduceSpecialize, } else { ValueType result[NumReductions] = {}; // Case where the reduction is on an array. -#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) map(to \ - : f) \ - is_device_ptr(scratch_ptr) reduction(+ : result[:NumReductions]) -#pragma omp parallel reduction(+ : result[:NumReductions]) +#pragma omp target teams num_teams(max_active_teams) thread_limit(team_size) \ + map(to : f) is_device_ptr(scratch_ptr) \ + reduction(+ : result[ : NumReductions]) +#pragma omp parallel reduction(+ : result[ : NumReductions]) { if (omp_get_num_teams() > max_active_teams) Kokkos::abort("`omp_set_num_teams` call was not respected.\n"); @@ -562,10 +530,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, blockIdx, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) - f(team, result); - else - f(TagType(), team, result); + f(team, result); } } @@ -577,12 +542,12 @@ struct ParallelReduceSpecialize, // FIXME_OPENMPTARGET : This routine is a copy from `parallel_reduce` over // RangePolicy. Need a new implementation. - static void execute_init_join(const FunctorType& f, const PolicyType& p, + static void execute_init_join(const FunctorAdapter& f, const PolicyType& p, PointerType ptr, const bool ptr_on_device) { - OpenMPTargetExec::verify_is_process( + Experimental::Impl::OpenMPTargetInternal::verify_is_process( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join "); - OpenMPTargetExec::verify_initialized( + Experimental::Impl::OpenMPTargetInternal::verify_initialized( "Kokkos::Experimental::OpenMPTarget TeamPolicy " "parallel_reduce:init_join"); using FunctorAnalysis = @@ -611,13 +576,14 @@ struct ParallelReduceSpecialize, const auto nteams = league_size; // Number of elements in the reduction - const auto value_count = FunctorAnalysis::value_count(f); + const auto value_count = FunctorAnalysis::value_count(f.get_functor()); // Allocate scratch per active thread. - OpenMPTargetExec::resize_scratch(1, 0, value_count * sizeof(ValueType), - league_size); - void* scratch_ptr = OpenMPTargetExec::get_scratch_ptr(); - typename FunctorAnalysis::Reducer final_reducer(f); + p.space().impl_internal_space_instance()->resize_scratch( + 1, 0, value_count * sizeof(ValueType), league_size); + void* scratch_ptr = + p.space().impl_internal_space_instance()->get_scratch_ptr(); + typename FunctorAnalysis::Reducer final_reducer(f.get_functor()); if (end <= begin) { // If there is no work to be done, copy back the initialized values and @@ -661,11 +627,7 @@ struct ParallelReduceSpecialize, typename PolicyType::member_type team( league_id, league_size, team_size, vector_length, scratch_ptr, team_num, shmem_size_L0, shmem_size_L1); - if constexpr (std::is_void_v) { - f(team, result); - } else { - f(TagType(), team, result); - } + f(team, result); } } // end parallel } // end target @@ -673,7 +635,7 @@ struct ParallelReduceSpecialize, int tree_neighbor_offset = 1; do { #pragma omp target teams distribute parallel for simd firstprivate( \ - final_reducer) is_device_ptr(scratch_ptr) + final_reducer) is_device_ptr(scratch_ptr) for (int i = 0; i < nteams - tree_neighbor_offset; i += 2 * tree_neighbor_offset) { ValueType* team_scratch = static_cast(scratch_ptr); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp index 9b578aca11..4308fb042a 100644 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp +++ b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Reducer.hpp @@ -34,9 +34,6 @@ struct OpenMPTargetReducerWrapper { KOKKOS_INLINE_FUNCTION static void join(value_type&, const value_type&) = delete; - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type&, const volatile value_type&) = delete; - KOKKOS_INLINE_FUNCTION static void init(value_type&) = delete; }; @@ -51,11 +48,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest += src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest += src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::sum(); @@ -72,11 +64,6 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { dest *= src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest *= src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::prod(); @@ -95,11 +82,6 @@ struct OpenMPTargetReducerWrapper> { if (src < dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src < dest) dest = src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::min(); @@ -118,11 +100,6 @@ struct OpenMPTargetReducerWrapper> { if (src > dest) dest = src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src > dest) dest = src; - } - // Required KOKKOS_INLINE_FUNCTION static void init(value_type& val) { @@ -141,11 +118,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest && src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest && src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::land(); @@ -166,11 +138,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest || src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest || src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::lor(); @@ -189,11 +156,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest & src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest & src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::band(); @@ -212,11 +174,6 @@ struct OpenMPTargetReducerWrapper> { dest = dest | src; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest = dest | src; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val = reduction_identity::bor(); @@ -236,12 +193,12 @@ struct OpenMPTargetReducerWrapper> { // Required KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val < dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) dest = src; + if (src.val < dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -263,12 +220,12 @@ struct OpenMPTargetReducerWrapper> { KOKKOS_INLINE_FUNCTION static void join(value_type& dest, const value_type& src) { - if (src.val > dest.val) dest = src; - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val > dest.val) dest = src; + if (src.val > dest.val) + dest = src; + else if (src.val == dest.val && + dest.loc == reduction_identity::min()) { + dest.loc = src.loc; + } } KOKKOS_INLINE_FUNCTION @@ -298,16 +255,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -331,22 +278,16 @@ struct OpenMPTargetReducerWrapper> { if (src.min_val < dest.min_val) { dest.min_val = src.min_val; dest.min_loc = src.min_loc; - } - if (src.max_val > dest.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } - } - - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; + } else if (dest.min_val == src.min_val && + dest.min_loc == reduction_identity::min()) { dest.min_loc = src.min_loc; } if (src.max_val > dest.max_val) { dest.max_val = src.max_val; dest.max_loc = src.max_loc; + } else if (dest.max_val == src.max_val && + dest.max_loc == reduction_identity::min()) { + dest.max_loc = src.max_loc; } } @@ -385,15 +326,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (dest.val < src.val) { - dest = src; - } else if (!(src.val < dest.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::max(); @@ -428,15 +360,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.val < dest.val) { - dest = src; - } else if (!(dest.val < src.val)) { - dest.loc = (src.loc < dest.loc) ? src.loc : dest.loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.val = reduction_identity::min(); @@ -480,23 +403,6 @@ struct OpenMPTargetReducerWrapper> { } } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - if (src.min_val < dest.min_val) { - dest.min_val = src.min_val; - dest.min_loc = src.min_loc; - } else if (!(dest.min_val < src.min_val)) { - dest.min_loc = (src.min_loc < dest.min_loc) ? src.min_loc : dest.min_loc; - } - - if (dest.max_val < src.max_val) { - dest.max_val = src.max_val; - dest.max_loc = src.max_loc; - } else if (!(src.max_val < dest.max_val)) { - dest.max_loc = (src.max_loc > dest.max_loc) ? src.max_loc : dest.max_loc; - } - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_val = reduction_identity::max(); @@ -531,13 +437,6 @@ struct OpenMPTargetReducerWrapper> { : dest.min_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_true = (src.min_loc_true < dest.min_loc_true) - ? src.min_loc_true - : dest.min_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_true = reduction_identity::min(); @@ -569,13 +468,6 @@ struct OpenMPTargetReducerWrapper> { : dest.max_loc_true; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (src.max_loc_true > dest.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = reduction_identity::max(); @@ -611,17 +503,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.max_loc_true = (dest.max_loc_true < src.max_loc_true) - ? src.max_loc_true - : dest.max_loc_true; - - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.max_loc_true = ::Kokkos::reduction_identity::max(); @@ -654,13 +535,6 @@ struct OpenMPTargetReducerWrapper> { : src.min_loc_false; } - KOKKOS_INLINE_FUNCTION - static void join(volatile value_type& dest, const volatile value_type& src) { - dest.min_loc_false = (dest.min_loc_false < src.min_loc_false) - ? dest.min_loc_false - : src.min_loc_false; - } - KOKKOS_INLINE_FUNCTION static void init(value_type& val) { val.min_loc_false = ::Kokkos::reduction_identity::min(); diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp deleted file mode 100644 index 458c4c9a43..0000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.cpp +++ /dev/null @@ -1,251 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE -#define KOKKOS_IMPL_PUBLIC_INCLUDE -#endif - -#include - -#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_ENABLE_TASKPOLICY) - -#include - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template class TaskQueue; - -//---------------------------------------------------------------------------- - -TaskExec::TaskExec() - : m_self_exec(0), - m_team_exec(0), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(0), - m_team_rank(0), - m_team_size(1) {} - -TaskExec::TaskExec( - Kokkos::Impl::OpenMPTargetExec &arg_exec, int const arg_team_size) - : m_self_exec(&arg_exec), - m_team_exec(arg_exec.pool_rev(arg_exec.pool_rank_rev() / arg_team_size)), - m_sync_mask(0), - m_sync_value(0), - m_sync_step(0), - m_group_rank(arg_exec.pool_rank_rev() / arg_team_size), - m_team_rank(arg_exec.pool_rank_rev() % arg_team_size), - m_team_size(arg_team_size) { - // This team spans - // m_self_exec->pool_rev( team_size * group_rank ) - // m_self_exec->pool_rev( team_size * ( group_rank + 1 ) - 1 ) - - int64_t volatile *const sync = (int64_t *)m_self_exec->scratch_reduce(); - - sync[0] = int64_t(0); - sync[1] = int64_t(0); - - for (int i = 0; i < m_team_size; ++i) { - m_sync_value |= int64_t(1) << (8 * i); - m_sync_mask |= int64_t(3) << (8 * i); - } - - Kokkos::memory_fence(); -} - -void TaskExec::team_barrier_impl() const { - if (m_team_exec->scratch_reduce_size() < int(2 * sizeof(int64_t))) { - Kokkos::abort("TaskQueue scratch_reduce memory too small"); - } - - // Use team shared memory to synchronize. - // Alternate memory locations between barriers to avoid a sequence - // of barriers overtaking one another. - - int64_t volatile *const sync = - ((int64_t *)m_team_exec->scratch_reduce()) + (m_sync_step & 0x01); - - // This team member sets one byte within the sync variable - int8_t volatile *const sync_self = ((int8_t *)sync) + m_team_rank; - - *sync_self = int8_t(m_sync_value & 0x03); // signal arrival - - while (m_sync_value != *sync) - ; // wait for team to arrive - - ++m_sync_step; - - if (0 == (0x01 & m_sync_step)) { // Every other step - m_sync_value ^= m_sync_mask; - if (1000 < m_sync_step) m_sync_step = 0; - } -} - -//---------------------------------------------------------------------------- - -void TaskQueueSpecialization::execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - using Member = TaskExec; - - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - // Required: team_size <= 8 - - const int team_size = PoolExec::pool_size(2); // Threads per core - // const int team_size = PoolExec::pool_size(1); // Threads per NUMA - - if (8 < team_size) { - Kokkos::abort("TaskQueue unsupported team size"); - } - -#pragma omp parallel - { - PoolExec &self = *PoolExec::get_thread_omp(); - - Member single_exec; - Member team_exec(self, team_size); - - // Team shared memory - task_root_type *volatile *const task_shared = - (task_root_type **)team_exec.m_team_exec->scratch_thread(); - -// Barrier across entire OpenMPTarget thread pool to insure initialization -#pragma omp barrier - - // Loop until all queues are empty and no tasks in flight - - do { - task_root_type *task = 0; - - // Each team lead attempts to acquire either a thread team task - // or a single thread task for the team. - - if (0 == team_exec.team_rank()) { - task = 0 < *((volatile int *)&queue->m_ready_count) ? end : 0; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - } - - // Team lead broadcast acquired task to team members: - - if (1 < team_exec.team_size()) { - if (0 == team_exec.team_rank()) *task_shared = task; - - // Fence to be sure task_shared is stored before the barrier - Kokkos::memory_fence(); - - // Whole team waits for every team member to reach this statement - team_exec.team_barrier(); - - // Fence to be sure task_shared is stored - Kokkos::memory_fence(); - - task = *task_shared; - } - - if (0 == task) break; // 0 == m_ready_count - - if (end == task) { - // All team members wait for whole team to reach this statement. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } else if (task_root_type::TaskTeam == task->m_task_type) { - // Thread Team Task - (*task->m_apply)(task, &team_exec); - - // The m_apply function performs a barrier - - if (0 == team_exec.team_rank()) { - // team member #0 completes the task, which may delete the task - queue->complete(task); - } - } else { - // Single Thread Task - - if (0 == team_exec.team_rank()) { - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - } - - // All team members wait for whole team to reach this statement. - // Not necessary to complete the task. - // Is necessary to prevent task_shared from being updated - // before it is read by all threads. - team_exec.team_barrier(); - } - } while (1); - } - // END #pragma omp parallel -} - -void TaskQueueSpecialization:: - iff_single_thread_recursive_execute( - TaskQueue *const queue) { - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = TaskQueue; - using task_root_type = TaskBase; - using Member = TaskExec; - - if (1 == omp_get_num_threads()) { - task_root_type *const end = (task_root_type *)task_root_type::EndTag; - - Member single_exec; - - task_root_type *task = end; - - do { - task = end; - - // Loop by priority and then type - for (int i = 0; i < queue_type::NumQueue && end == task; ++i) { - for (int j = 0; j < 2 && end == task; ++j) { - task = queue_type::pop_task(&queue->m_ready[i][j]); - } - } - - if (end == task) break; - - (*task->m_apply)(task, &single_exec); - - queue->complete(task); - - } while (1); - } -} - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_OPENMPTARGET ) && defined( \ - KOKKOS_ENABLE_TASKPOLICY ) */ diff --git a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp b/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp deleted file mode 100644 index c9aa7b128f..0000000000 --- a/lib/kokkos/core/src/OpenMPTarget/Kokkos_OpenMPTarget_Task.hpp +++ /dev/null @@ -1,319 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_OPENMP_TASK_HPP -#define KOKKOS_IMPL_OPENMP_TASK_HPP - -#if defined(KOKKOS_ENABLE_TASKPOLICY) - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { -namespace Impl { - -template <> -class TaskQueueSpecialization { - public: - using execution_space = Kokkos::Experimental::OpenMPTarget; - using queue_type = Kokkos::Impl::TaskQueue; - using task_base_type = Kokkos::Impl::TaskBase; - - // Must specify memory space - using memory_space = Kokkos::HostSpace; - - static void iff_single_thread_recursive_execute(queue_type* const); - - // Must provide task queue execution function - static void execute(queue_type* const); - - // Must provide mechanism to set function pointer in - // execution space from the host process. - template - static void proc_set_apply(task_base_type::function_type* ptr) { - using TaskType = TaskBase; - *ptr = TaskType::apply; - } -}; - -extern template class TaskQueue; - -//---------------------------------------------------------------------------- - -template <> -class TaskExec { - private: - TaskExec(TaskExec&&) = delete; - TaskExec(TaskExec const&) = delete; - TaskExec& operator=(TaskExec&&) = delete; - TaskExec& operator=(TaskExec const&) = delete; - - using PoolExec = Kokkos::Impl::OpenMPTargetExec; - - friend class Kokkos::Impl::TaskQueue; - friend class Kokkos::Impl::TaskQueueSpecialization< - Kokkos::Experimental::OpenMPTarget>; - - PoolExec* const m_self_exec; ///< This thread's thread pool data structure - PoolExec* const m_team_exec; ///< Team thread's thread pool data structure - int64_t m_sync_mask; - int64_t mutable m_sync_value; - int mutable m_sync_step; - int m_group_rank; ///< Which "team" subset of thread pool - int m_team_rank; ///< Which thread within a team - int m_team_size; - - TaskExec(); - TaskExec(PoolExec& arg_exec, int arg_team_size); - - void team_barrier_impl() const; - - public: - KOKKOS_FUNCTION void* team_shared() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread() : nullptr;)) - - KOKKOS_IF_ON_DEVICE((return nullptr;)) - } - - KOKKOS_FUNCTION int team_shared_size() const { - KOKKOS_IF_ON_HOST( - (return m_team_exec ? m_team_exec->scratch_thread_size() : 0;)) - - KOKKOS_IF_ON_DEVICE((return 0;)) - } - - /**\brief Whole team enters this function call - * before any teeam member returns from - * this function call. - */ - KOKKOS_FUNCTION void team_barrier() const { - KOKKOS_IF_ON_HOST((if (1 < m_team_size) { team_barrier_impl(); })) - } - - KOKKOS_INLINE_FUNCTION - int team_rank() const { return m_team_rank; } - - KOKKOS_INLINE_FUNCTION - int team_size() const { return m_team_size; } -}; - -} // namespace Impl -} // namespace Kokkos - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -namespace Kokkos { - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& count) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, - count); -} - -template -KOKKOS_INLINE_FUNCTION Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec > -TeamThreadRange(Impl::TaskExec& thread, - const iType& start, const iType& end) { - return Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >(thread, start, - end); -} - -/** \brief Inter-thread parallel_for. Executes lambda(iType i) for each - * i=0..N-1. - * - * The range i=0..N-1 is mapped to all threads of the the calling thread team. - */ -template -KOKKOS_INLINE_FUNCTION void parallel_for( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i); - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - shared[0] += shared[i]; - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - ValueType result = initialized_result; - - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - lambda(i, result); - } - - if (1 < loop_boundaries.thread.team_size()) { - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - - loop_boundaries.thread.team_barrier(); - shared[team_rank] = result; - - loop_boundaries.thread.team_barrier(); - - // reduce across threads to thread 0 - if (team_rank == 0) { - for (int i = 1; i < loop_boundaries.thread.team_size(); i++) { - join(shared[0], shared[i]); - } - } - - loop_boundaries.thread.team_barrier(); - - // broadcast result - initialized_result = shared[0]; - } else { - initialized_result = result; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, ValueType& initialized_result) {} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_reduce( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda, const JoinType& join, ValueType& initialized_result) { -} - -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::TeamThreadRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) { - ValueType accum = 0; - ValueType val, local_total; - ValueType* shared = (ValueType*)loop_boundaries.thread.team_shared(); - int team_size = loop_boundaries.thread.team_size(); - int team_rank = - loop_boundaries.thread.team_rank(); // member num within the team - - // Intra-member scan - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } - - shared[team_rank] = accum; - loop_boundaries.thread.team_barrier(); - - // Member 0 do scan on accumulated totals - if (team_rank == 0) { - for (iType i = 1; i < team_size; i += 1) { - shared[i] += shared[i - 1]; - } - accum = 0; // Member 0 set accum to 0 in preparation for inter-member scan - } - - loop_boundaries.thread.team_barrier(); - - // Inter-member scan adding in accumulated totals - if (team_rank != 0) { - accum = shared[team_rank - 1]; - } - for (iType i = loop_boundaries.start; i < loop_boundaries.end; - i += loop_boundaries.increment) { - local_total = 0; - lambda(i, local_total, false); - val = accum; - lambda(i, val, true); - accum += local_total; - } -} - -// placeholder for future function -template -KOKKOS_INLINE_FUNCTION void parallel_scan( - const Impl::ThreadVectorRangeBoundariesStruct< - iType, Impl::TaskExec >& - loop_boundaries, - const Lambda& lambda) {} - -} /* namespace Kokkos */ - -//---------------------------------------------------------------------------- -//---------------------------------------------------------------------------- - -#endif /* #if defined( KOKKOS_ENABLE_TASKPOLICY ) */ -#endif /* #ifndef KOKKOS_IMPL_OPENMP_TASK_HPP */ diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp index 4de6931918..2583a1cdc0 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.cpp @@ -46,7 +46,6 @@ struct Container { } // namespace namespace Kokkos { -namespace Experimental { SYCL::SYCL() : m_space_instance(&Impl::SYCLInternal::singleton(), [](Impl::SYCLInternal*) {}) { @@ -100,6 +99,11 @@ void SYCL::print_configuration(std::ostream& os, bool verbose) const { #else os << "macro KOKKOS_IMPL_SYCL_DEVICE_GLOBAL_SUPPORTED : undefined\n"; #endif +#ifdef KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : defined\n"; +#else + os << "macro KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE : undefined\n"; +#endif #ifdef SYCL_EXT_ONEAPI_DEVICE_GLOBAL os << "macro SYCL_EXT_ONEAPI_DEVICE_GLOBAL : defined\n"; #else @@ -172,8 +176,7 @@ void SYCL::fence(const std::string& name) const { } void SYCL::impl_static_fence(const std::string& name) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::SpecialSynchronizationCases:: GlobalDeviceSynchronization, @@ -261,8 +264,6 @@ std::ostream& SYCL::impl_sycl_info(std::ostream& os, << device.get_info() << "\nImage Max Buffer Size: " << device.get_info() - << "\nImage Max Array Size: " - << device.get_info() << "\nMax Samplers: " << device.get_info() << "\nMax Parameter Size: " << device.get_info() @@ -317,5 +318,4 @@ int g_sycl_space_factory_initialized = Kokkos::Impl::initialize_space_factory("170_SYCL"); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp index 0f3d1f0994..937dcceab4 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL.hpp @@ -39,7 +39,6 @@ static_assert(false, #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal; } @@ -91,9 +90,8 @@ class SYCL { /** \brief Wait until all dispatched functors complete. A noop for OpenMP. */ static void impl_static_fence(const std::string& name); - void fence( - const std::string& name = - "Kokkos::Experimental::SYCL::fence: Unnamed Instance Fence") const; + void fence(const std::string& name = + "Kokkos::SYCL::fence: Unnamed Instance Fence") const; /// \brief Print configuration information to the given output stream. void print_configuration(std::ostream& os, bool verbose = false) const; @@ -131,15 +129,13 @@ class SYCL { Kokkos::Impl::HostSharedPtr m_space_instance; }; -} // namespace Experimental - namespace Tools { namespace Experimental { template <> -struct DeviceTypeTraits { +struct DeviceTypeTraits { /// \brief An ID to differentiate (for example) Serial from OpenMP in Tooling static constexpr DeviceType id = DeviceType::SYCL; - static int device_id(const Kokkos::Experimental::SYCL& exec) { + static int device_id(const Kokkos::SYCL& exec) { return exec.impl_internal_space_instance()->m_syclDev; } }; @@ -185,10 +181,11 @@ std::vector partition_space(const SYCL& sycl_space, return instances; } +} // namespace Experimental + namespace Impl { std::vector get_sycl_devices(); } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp index afc7eebd38..a9e2eca4fb 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_DeepCopy.hpp @@ -28,37 +28,34 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n); -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n); +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n); void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n); template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; template -struct DeepCopy::value && is_sycl_type_space::value>> { DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } - DeepCopy(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { + DeepCopy(const Kokkos::SYCL& instance, void* dst, const void* src, size_t n) { DeepCopyAsyncSYCL(instance, dst, src, n); } }; @@ -66,10 +63,9 @@ struct DeepCopy struct DeepCopy< MemSpace1, MemSpace2, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + is_sycl_type_space::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -93,9 +89,8 @@ struct DeepCopy< template struct DeepCopy< MemSpace, HostSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } @@ -118,9 +113,8 @@ struct DeepCopy< template struct DeepCopy< HostSpace, MemSpace, ExecutionSpace, - std::enable_if_t< - is_sycl_type_space::value && - !std::is_same::value>> { + std::enable_if_t::value && + !std::is_same::value>> { inline DeepCopy(void* dst, const void* src, size_t n) { DeepCopySYCL(dst, src, n); } diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp index 9c39df9415..54ca645995 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNodeKernel.hpp @@ -32,30 +32,29 @@ namespace Impl { template -class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag< - PatternTag, Functor, PolicyType, Args..., - Kokkos::Experimental::SYCL>::type { +class GraphNodeKernelImpl + : public PatternImplSpecializationFromTag::type { public: using Policy = PolicyType; using graph_kernel = GraphNodeKernelImpl; - using base_t = typename PatternImplSpecializationFromTag< - PatternTag, Functor, Policy, Args..., Kokkos::Experimental::SYCL>::type; + using base_t = + typename PatternImplSpecializationFromTag::type; // TODO use the name and executionspace template - GraphNodeKernelImpl(std::string, Kokkos::Experimental::SYCL const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + GraphNodeKernelImpl(std::string, Kokkos::SYCL const&, Functor arg_functor, + PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + : base_t(std::move(arg_functor), (PolicyDeduced&&)arg_policy, + (ArgsDeduced&&)args...) {} template - GraphNodeKernelImpl(Kokkos::Experimental::SYCL const& exec_space, - Functor arg_functor, PolicyDeduced&& arg_policy) + GraphNodeKernelImpl(Kokkos::SYCL const& exec_space, Functor arg_functor, + PolicyDeduced&& arg_policy) : GraphNodeKernelImpl("", exec_space, std::move(arg_functor), - (PolicyDeduced &&) arg_policy) {} + (PolicyDeduced&&)arg_policy) {} void set_sycl_graph_ptr( sycl::ext::oneapi::experimental::command_graph< @@ -102,14 +101,14 @@ template ::type> struct get_graph_node_kernel_type - : type_identity> {}; + : type_identity< + GraphNodeKernelImpl> {}; template struct get_graph_node_kernel_type : type_identity, Kokkos::ParallelReduceTag>> {}; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp index 6bbe6711a2..828f1cacb4 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_GraphNode_Impl.hpp @@ -28,7 +28,7 @@ namespace Kokkos { namespace Impl { template <> -struct GraphNodeBackendSpecificDetails { +struct GraphNodeBackendSpecificDetails { std::optional node; explicit GraphNodeBackendSpecificDetails() = default; @@ -38,16 +38,16 @@ struct GraphNodeBackendSpecificDetails { }; template -struct GraphNodeBackendDetailsBeforeTypeErasure { +struct GraphNodeBackendDetailsBeforeTypeErasure { protected: GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, Kernel &, PredecessorRef const &, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, Kernel &, PredecessorRef const &, + GraphNodeBackendSpecificDetails &) noexcept {} GraphNodeBackendDetailsBeforeTypeErasure( - Kokkos::Experimental::SYCL const &, _graph_node_is_root_ctor_tag, - GraphNodeBackendSpecificDetails &) noexcept {} + Kokkos::SYCL const &, _graph_node_is_root_ctor_tag, + GraphNodeBackendSpecificDetails &) noexcept {} }; } // namespace Impl diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp index 1dc4a9c997..dc63052dd7 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Graph_Impl.hpp @@ -31,29 +31,28 @@ namespace Kokkos { namespace Impl { template <> -class GraphImpl { +class GraphImpl { public: - using node_details_t = - GraphNodeBackendSpecificDetails; - using root_node_impl_t = GraphNodeImpl; + using node_details_t = GraphNodeBackendSpecificDetails; + using root_node_impl_t = + GraphNodeImpl; using aggregate_kernel_impl_t = SYCLGraphNodeAggregateKernel; using aggregate_node_impl_t = - GraphNodeImpl; // Not movable or copyable; it spends its whole life as a shared_ptr in the // Graph object. - GraphImpl() = delete; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = delete; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; + GraphImpl& operator=(GraphImpl&&) = delete; ~GraphImpl(); - explicit GraphImpl(Kokkos::Experimental::SYCL instance); + explicit GraphImpl(Kokkos::SYCL instance); void add_node(std::shared_ptr const& arg_node_ptr); @@ -63,19 +62,25 @@ class GraphImpl { template void add_predecessor(NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref); - void submit(); + void submit(const Kokkos::SYCL& exec); - Kokkos::Experimental::SYCL const& get_execution_space() const noexcept; + Kokkos::SYCL const& get_execution_space() const noexcept; auto create_root_node_ptr(); template auto create_aggregate_ptr(PredecessorRefs&&...); - private: - void instantiate_graph() { m_graph_exec = m_graph.finalize(); } + void instantiate() { + KOKKOS_EXPECTS(!m_graph_exec.has_value()); + m_graph_exec = m_graph.finalize(); + } - Kokkos::Experimental::SYCL m_execution_space; + auto& sycl_graph() { return m_graph; } + auto& sycl_graph_exec() { return m_graph_exec; } + + private: + Kokkos::SYCL m_execution_space; sycl::ext::oneapi::experimental::command_graph< sycl::ext::oneapi::experimental::graph_state::modifiable> m_graph; @@ -84,17 +89,16 @@ class GraphImpl { m_graph_exec; }; -inline GraphImpl::~GraphImpl() { +inline GraphImpl::~GraphImpl() { m_execution_space.fence("Kokkos::GraphImpl::~GraphImpl: Graph Destruction"); } -inline GraphImpl::GraphImpl( - Kokkos::Experimental::SYCL instance) +inline GraphImpl::GraphImpl(Kokkos::SYCL instance) : m_execution_space(std::move(instance)), m_graph(m_execution_space.sycl_queue().get_context(), m_execution_space.sycl_queue().get_device()) {} -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { // add an empty node that needs to be set up before finalizing the graph arg_node_ptr->node_details_t::node = m_graph.add(); @@ -103,7 +107,7 @@ inline void GraphImpl::add_node( // Requires NodeImplPtr is a shared_ptr to specialization of GraphNodeImpl // Also requires that the kernel has the graph node tag in its policy template -inline void GraphImpl::add_node( +inline void GraphImpl::add_node( std::shared_ptr const& arg_node_ptr) { static_assert(NodeImpl::kernel_type::Policy::is_graph_kernel::value); KOKKOS_EXPECTS(arg_node_ptr); @@ -122,7 +126,7 @@ inline void GraphImpl::add_node( // already been added to this graph and NodeImpl is a specialization of // GraphNodeImpl that has already been added to this graph. template -inline void GraphImpl::add_predecessor( +inline void GraphImpl::add_predecessor( NodeImplPtr arg_node_ptr, PredecessorRef arg_pred_ref) { KOKKOS_EXPECTS(arg_node_ptr); auto pred_ptr = GraphAccess::get_node_ptr(arg_pred_ref); @@ -137,19 +141,19 @@ inline void GraphImpl::add_predecessor( m_graph.make_edge(*pred_node, *node); } -inline void GraphImpl::submit() { +inline void GraphImpl::submit(const Kokkos::SYCL& exec) { if (!m_graph_exec) { - instantiate_graph(); + instantiate(); } - m_execution_space.sycl_queue().ext_oneapi_graph(*m_graph_exec); + exec.sycl_queue().ext_oneapi_graph(*m_graph_exec); } -inline Kokkos::Experimental::SYCL const& -GraphImpl::get_execution_space() const noexcept { +inline Kokkos::SYCL const& GraphImpl::get_execution_space() + const noexcept { return m_execution_space; } -inline auto GraphImpl::create_root_node_ptr() { +inline auto GraphImpl::create_root_node_ptr() { KOKKOS_EXPECTS(!m_graph_exec); auto rv = std::make_shared(get_execution_space(), _graph_node_is_root_ctor_tag{}); @@ -158,7 +162,7 @@ inline auto GraphImpl::create_root_node_ptr() { } template -inline auto GraphImpl::create_aggregate_ptr( +inline auto GraphImpl::create_aggregate_ptr( PredecessorRefs&&...) { // The attachment to predecessors, which is all we really need, happens // in the generic layer, which calls through to add_predecessor for diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp index 5843dca812..5af1330d93 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.cpp @@ -24,14 +24,12 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { namespace { // FIXME_SYCL Should be a multiple of the maximum subgroup size. -static constexpr auto sizeScratchGrain = - sizeof(Kokkos::Experimental::SYCL::size_type[32]); +static constexpr auto sizeScratchGrain = sizeof(Kokkos::SYCL::size_type[32]); std::size_t scratch_count(const std::size_t size) { return (size + sizeScratchGrain - 1) / sizeScratchGrain; @@ -55,8 +53,8 @@ Kokkos::View sycl_global_unique_token_locks( SYCLInternal::~SYCLInternal() { if (!was_finalized || m_scratchSpace || m_scratchHost || m_scratchFlags) { - std::cerr << "Kokkos::Experimental::SYCL ERROR: Failed to call " - "Kokkos::Experimental::SYCL::finalize()" + std::cerr << "Kokkos::SYCL ERROR: Failed to call " + "Kokkos::SYCL::finalize()" << std::endl; std::cerr.flush(); } @@ -64,7 +62,7 @@ SYCLInternal::~SYCLInternal() { int SYCLInternal::verify_is_initialized(const char* const label) const { if (!is_initialized()) { - Kokkos::abort((std::string("Kokkos::Experimental::SYCL::") + label + + Kokkos::abort((std::string("Kokkos::SYCL::") + label + " : ERROR device not initialized\n") .c_str()); } @@ -171,12 +169,12 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( // Multiple ParallelFor/Reduce Teams can call this function at the same time // and invalidate the m_team_scratch_ptr. We use a pool to avoid any race // condition. - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (m_team_scratch_current_size[scratch_pool_id] == 0 && bytes > 0) { m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } if ((bytes > m_team_scratch_current_size[scratch_pool_id]) || ((bytes < m_team_scratch_current_size[scratch_pool_id]) && @@ -184,9 +182,9 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::resize_team_scratch_space( mem_space.deallocate(m_team_scratch_ptr[scratch_pool_id], m_team_scratch_current_size[scratch_pool_id]); m_team_scratch_current_size[scratch_pool_id] = bytes; - m_team_scratch_ptr[scratch_pool_id] = mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalTeamScratchMemory", - m_team_scratch_current_size[scratch_pool_id]); + m_team_scratch_ptr[scratch_pool_id] = + mem_space.allocate("Kokkos::SYCL::InternalTeamScratchMemory", + m_team_scratch_current_size[scratch_pool_id]); } return m_team_scratch_ptr[scratch_pool_id]; } @@ -255,7 +253,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( const std::size_t size) { if (verify_is_initialized("scratch_space") && m_scratchSpaceCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchSpace) mem_space.deallocate(m_scratchSpace, @@ -265,8 +263,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_space( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchSpaceCount, sizeScratchGrain); - m_scratchSpace = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchSpace", alloc_size)); + m_scratchSpace = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchSpace", alloc_size)); } return m_scratchSpace; @@ -276,7 +274,7 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( const std::size_t size) { if (verify_is_initialized("scratch_unified") && m_scratchHostCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLHostUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLHostUSMSpace(*m_queue); if (nullptr != m_scratchHost) mem_space.deallocate(m_scratchHost, @@ -286,8 +284,8 @@ Kokkos::Impl::sycl_host_ptr SYCLInternal::scratch_host( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchHostCount, sizeScratchGrain); - m_scratchHost = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchHost", alloc_size)); + m_scratchHost = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchHost", alloc_size)); } return m_scratchHost; @@ -297,7 +295,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( const std::size_t size) { if (verify_is_initialized("scratch_flags") && m_scratchFlagsCount < scratch_count(size)) { - auto mem_space = Kokkos::Experimental::SYCLDeviceUSMSpace(*m_queue); + auto mem_space = Kokkos::SYCLDeviceUSMSpace(*m_queue); if (nullptr != m_scratchFlags) mem_space.deallocate(m_scratchFlags, @@ -307,8 +305,8 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( std::size_t alloc_size = Kokkos::Impl::multiply_overflow_abort( m_scratchFlagsCount, sizeScratchGrain); - m_scratchFlags = static_cast(mem_space.allocate( - "Kokkos::Experimental::SYCL::InternalScratchFlags", alloc_size)); + m_scratchFlags = static_cast( + mem_space.allocate("Kokkos::SYCL::InternalScratchFlags", alloc_size)); // We only zero-initialize the allocation when we actually allocate. // It's the responsibility of the features using scratch_flags, @@ -326,8 +324,7 @@ Kokkos::Impl::sycl_device_ptr SYCLInternal::scratch_flags( template void SYCLInternal::fence_helper(WAT& wat, const std::string& name, uint32_t instance_id) { - Kokkos::Tools::Experimental::Impl::profile_fence_event< - Kokkos::Experimental::SYCL>( + Kokkos::Tools::Experimental::Impl::profile_fence_event( name, Kokkos::Tools::Experimental::Impl::DirectFenceIDHandle{instance_id}, [&]() { try { @@ -364,8 +361,7 @@ size_t SYCLInternal::USMObjectMem::reserve(size_t n) { AllocationSpace alloc_space(*m_q); if (m_data) alloc_space.deallocate(m_data, m_capacity); - m_data = - alloc_space.allocate("Kokkos::Experimental::SYCL::USMObjectMem", n); + m_data = alloc_space.allocate("Kokkos::SYCL::USMObjectMem", n); if constexpr (sycl::usm::alloc::device == Kind) m_staging.reset(new char[n]); @@ -396,5 +392,4 @@ template class SYCLInternal::USMObjectMem; template class SYCLInternal::USMObjectMem; } // namespace Impl -} // namespace Experimental } // namespace Kokkos diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp index 2d784ef8a5..c982154a9a 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Instance.hpp @@ -28,7 +28,6 @@ #include #include namespace Kokkos { -namespace Experimental { namespace Impl { class SYCLInternal { @@ -38,10 +37,10 @@ class SYCLInternal { SYCLInternal() = default; ~SYCLInternal(); - SYCLInternal(const SYCLInternal&) = delete; + SYCLInternal(const SYCLInternal&) = delete; SYCLInternal& operator=(const SYCLInternal&) = delete; - SYCLInternal& operator=(SYCLInternal&&) = delete; - SYCLInternal(SYCLInternal&&) = delete; + SYCLInternal& operator=(SYCLInternal&&) = delete; + SYCLInternal(SYCLInternal&&) = delete; Kokkos::Impl::sycl_device_ptr scratch_space(const std::size_t size); Kokkos::Impl::sycl_device_ptr scratch_flags(const std::size_t size); @@ -76,8 +75,9 @@ class SYCLInternal { mutable sycl::event m_team_scratch_event[m_n_team_scratch] = {}; mutable std::mutex m_team_scratch_mutex; - uint32_t m_instance_id = Kokkos::Tools::Experimental::Impl::idForInstance< - Kokkos::Experimental::SYCL>(reinterpret_cast(this)); + uint32_t m_instance_id = + Kokkos::Tools::Experimental::Impl::idForInstance( + reinterpret_cast(this)); std::optional m_queue; // Using std::vector> reveals a compiler bug when @@ -102,9 +102,9 @@ class SYCLInternal { explicit USMObjectMem(sycl::queue q, uint32_t instance_id) noexcept : m_q(std::move(q)), m_instance_id(instance_id) {} - USMObjectMem(USMObjectMem const&) = delete; - USMObjectMem(USMObjectMem&&) = delete; - USMObjectMem& operator=(USMObjectMem&&) = delete; + USMObjectMem(USMObjectMem const&) = delete; + USMObjectMem(USMObjectMem&&) = delete; + USMObjectMem& operator=(USMObjectMem&&) = delete; USMObjectMem& operator=(USMObjectMem const&) = delete; ~USMObjectMem() { reset(); }; @@ -119,12 +119,12 @@ class SYCLInternal { size_t reserve(size_t n); private: - using AllocationSpace = std::conditional_t< - Kind == sycl::usm::alloc::device, - Kokkos::Experimental::SYCLDeviceUSMSpace, - std::conditional_t>; + using AllocationSpace = + std::conditional_t>; public: // Performs either sycl::memcpy (for USM device memory) or std::memcpy @@ -144,11 +144,10 @@ class SYCLInternal { } void fence() { - SYCLInternal::fence( - m_last_event, - "Kokkos::Experimental::SYCLInternal::USMObject fence to wait for " - "last event to finish", - m_instance_id); + SYCLInternal::fence(m_last_event, + "Kokkos::SYCLInternal::USMObject fence to wait for " + "last event to finish", + m_instance_id); } void register_event(sycl::event event) { @@ -324,13 +323,12 @@ auto make_sycl_function_wrapper(const Functor& functor, Storage& storage) { return SYCLFunctionWrapper(functor, storage); } } // namespace Impl -} // namespace Experimental } // namespace Kokkos #if defined(SYCL_DEVICE_COPYABLE) && defined(KOKKOS_ARCH_INTEL_GPU) template struct sycl::is_device_copyable< - Kokkos::Experimental::Impl::SYCLFunctionWrapper> + Kokkos::Impl::SYCLFunctionWrapper> : std::true_type {}; #if (defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER < 20240000) || \ @@ -352,8 +350,7 @@ static_assert( template struct sycl::is_device_copyable< - const Kokkos::Experimental::Impl::SYCLFunctionWrapper, + const Kokkos::Impl::SYCLFunctionWrapper, std::enable_if_t>>> : std::true_type {}; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp index d212e2dacc..9498513a3e 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_MDRangePolicy.hpp @@ -22,13 +22,13 @@ namespace Kokkos { template <> -struct default_outer_direction { +struct default_outer_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; template <> -struct default_inner_direction { +struct default_inner_direction { using type = Iterate; static constexpr Iterate value = Iterate::Left; }; @@ -37,8 +37,8 @@ namespace Impl { // Settings for MDRangePolicy template <> -inline TileSizeProperties get_tile_size_properties( - const Kokkos::Experimental::SYCL& space) { +inline TileSizeProperties get_tile_size_properties( + const Kokkos::SYCL& space) { TileSizeProperties properties; properties.max_threads = space.impl_internal_space_instance()->m_maxWorkgroupSize; @@ -50,8 +50,7 @@ inline TileSizeProperties get_tile_size_properties( // Settings for TeamMDRangePolicy template -struct ThreadAndVectorNestLevel +struct ThreadAndVectorNestLevel : AcceleratorBasedNestLevel {}; } // namespace Impl diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp index cb7b1048da..3dbd63d81a 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_MDRange.hpp @@ -25,7 +25,7 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; @@ -54,7 +54,7 @@ class Kokkos::Impl::ParallelFor, const typename Policy::index_type m_num_tiles; static constexpr Iterate inner_direction = Policy::inner_direction; } m_policy; - const Kokkos::Experimental::SYCL& m_space; + const Kokkos::SYCL& m_space; sycl::nd_range<3> compute_ranges() const { const auto& m_tile = m_policy.m_tile; @@ -180,12 +180,11 @@ class Kokkos::Impl::ParallelFor, } void execute() const { - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = - m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_space.impl_internal_space_instance()->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp index 8ef43d392c..da75f3e901 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Range.hpp @@ -67,7 +67,7 @@ struct FunctorWrapperRangePolicyParallelForCustom { template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; @@ -82,8 +82,8 @@ class Kokkos::Impl::ParallelFor, sycl::event sycl_direct_launch(const Policy& policy, const Functor& functor, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -160,13 +160,13 @@ class Kokkos::Impl::ParallelFor, void execute() const { if (m_policy.begin() == m_policy.end()) return; - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = m_policy.space() - .impl_internal_space_instance() - ->get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + m_policy.space() + .impl_internal_space_instance() + ->get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_wrapper, functor_wrapper.get_copy_event()); functor_wrapper.register_event(event); diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp index cf7f582bc7..d8859cda9f 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelFor_Team.hpp @@ -27,11 +27,11 @@ template class Kokkos::Impl::ParallelFor, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using functor_type = FunctorType; - using size_type = ::Kokkos::Experimental::SYCL::size_type; + using size_type = ::Kokkos::SYCL::size_type; private: using member_type = typename Policy::member_type; @@ -52,8 +52,8 @@ class Kokkos::Impl::ParallelFor, const FunctorWrapper& functor_wrapper, const sycl::event& memcpy_event) const { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - sycl::queue& q = space.sycl_queue(); + const Kokkos::SYCL& space = m_policy.space(); + sycl::queue& q = space.sycl_queue(); desul::ensure_sycl_lock_arrays_on_device(q); @@ -146,11 +146,11 @@ class Kokkos::Impl::ParallelFor, scratch_pool_id, static_cast(m_scratch_size[1]) * m_league_size)); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_wrapper, functor_wrapper.get_copy_event()); @@ -164,10 +164,14 @@ class Kokkos::Impl::ParallelFor, m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended(arg_functor, ParallelForTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + } m_shmem_begin = (sizeof(double) * (m_team_size + 2)); m_shmem_size = diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp index 0774b24bca..1e31354975 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_MDRange.hpp @@ -30,7 +30,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = Kokkos::MDRangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -76,7 +76,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -85,7 +85,7 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch( functor_reducer_wrapper, functor_reducer_wrapper.get_copy_event()); @@ -370,7 +368,7 @@ class Kokkos::Impl::ParallelReduce template -class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { +class Kokkos::Impl::ParallelReduce< + CombinedFunctorReducerType, Kokkos::RangePolicy, Kokkos::SYCL> { public: using Policy = Kokkos::RangePolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -49,7 +48,7 @@ class Kokkos::Impl::ParallelReduce::accessible) {} private: @@ -59,8 +58,8 @@ class Kokkos::Impl::ParallelReduce scratch_buffers_lock( instance.m_mutexScratchSpace); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(m_policy, functor_reducer_wrapper, diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp index b443bcbf90..8f5310cbb2 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelReduce_Team.hpp @@ -29,7 +29,7 @@ template class Kokkos::Impl::ParallelReduce, - Kokkos::Experimental::SYCL> { + Kokkos::SYCL> { public: using Policy = TeamPolicy; using FunctorType = typename CombinedFunctorReducerType::functor_type; @@ -46,7 +46,7 @@ class Kokkos::Impl::ParallelReduce(m_scratch_size[1]) * m_league_size)); - using IndirectKernelMem = - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem; + using IndirectKernelMem = Kokkos::Impl::SYCLInternal::IndirectKernelMem; IndirectKernelMem& indirectKernelMem = instance.get_indirect_kernel_mem(); auto functor_reducer_wrapper = - Experimental::Impl::make_sycl_function_wrapper(m_functor_reducer, - indirectKernelMem); + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(global_scratch_ptr, functor_reducer_wrapper, @@ -436,16 +434,21 @@ class Kokkos::Impl::ParallelReduce::accessible), m_league_size(arg_policy.league_size()), m_team_size(arg_policy.team_size()), m_vector_size(arg_policy.impl_vector_length()) { - // FIXME_SYCL optimize - if (m_team_size < 0) + if (m_team_size < 0) { m_team_size = m_policy.team_size_recommended( m_functor_reducer.get_functor(), m_functor_reducer.get_reducer(), ParallelReduceTag{}); + if (m_team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelReduce could not find a " + "valid execution configuration."); + } + // Must be a power of two greater than two, get the one not bigger than the // requested one. if ((m_team_size & m_team_size - 1) || m_team_size < 2) { @@ -461,7 +464,7 @@ class Kokkos::Impl::ParallelReduce(instance.m_maxShmemPerBlock) < m_shmem_size - m_shmem_begin) { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp index bdb5b88377..ed7cee2805 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ParallelScan_Range.hpp @@ -145,7 +145,7 @@ class ParallelScanSYCLBase { using value_type = typename Analysis::value_type; using reference_type = typename Analysis::reference_type; using functor_type = FunctorType; - using size_type = Kokkos::Experimental::SYCL::size_type; + using size_type = Kokkos::SYCL::size_type; using index_type = typename Policy::index_type; protected: @@ -161,8 +161,8 @@ class ParallelScanSYCLBase { sycl::event sycl_direct_launch(const FunctorWrapper& functor_wrapper, sycl::event memcpy_event) { // Convenience references - const Kokkos::Experimental::SYCL& space = m_policy.space(); - Kokkos::Experimental::Impl::SYCLInternal& instance = + const Kokkos::SYCL& space = m_policy.space(); + Kokkos::Impl::SYCLInternal& instance = *space.impl_internal_space_instance(); sycl::queue& q = space.sycl_queue(); @@ -374,11 +374,11 @@ class ParallelScanSYCLBase { std::scoped_lock scratch_buffers_lock( instance.m_mutexScratchSpace); - Kokkos::Experimental::Impl::SYCLInternal::IndirectKernelMem& - indirectKernelMem = instance.get_indirect_kernel_mem(); + Kokkos::Impl::SYCLInternal::IndirectKernelMem& indirectKernelMem = + instance.get_indirect_kernel_mem(); - auto functor_wrapper = Experimental::Impl::make_sycl_function_wrapper( - m_functor_reducer, indirectKernelMem); + auto functor_wrapper = + Impl::make_sycl_function_wrapper(m_functor_reducer, indirectKernelMem); sycl::event event = sycl_direct_launch(functor_wrapper, functor_wrapper.get_copy_event()); @@ -399,7 +399,7 @@ class ParallelScanSYCLBase { template class Kokkos::Impl::ParallelScan, - Kokkos::Experimental::SYCL> + Kokkos::SYCL> : private ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; @@ -417,13 +417,12 @@ class Kokkos::Impl::ParallelScan, template class Kokkos::Impl::ParallelScanWithTotal< - FunctorType, Kokkos::RangePolicy, ReturnType, - Kokkos::Experimental::SYCL> + FunctorType, Kokkos::RangePolicy, ReturnType, Kokkos::SYCL> : public ParallelScanSYCLBase { public: using Base = ParallelScanSYCLBase; - const Kokkos::Experimental::SYCL& m_exec; + const Kokkos::SYCL& m_exec; inline void execute() { Base::impl_execute([&]() { @@ -445,7 +444,7 @@ class Kokkos::Impl::ParallelScanWithTotal< const typename Base::Policy& arg_policy, const ViewType& arg_result_view) : Base(arg_functor, arg_policy, arg_result_view.data(), - MemorySpaceAccess::accessible), m_exec(arg_policy.space()) {} }; diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp index 19fad29150..022f88e0a8 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.cpp @@ -33,11 +33,11 @@ namespace Kokkos { namespace Impl { void DeepCopySYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); } -void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, - const void* src, size_t n) { +void DeepCopyAsyncSYCL(const Kokkos::SYCL& instance, void* dst, const void* src, + size_t n) { sycl::queue& q = *instance.impl_internal_space_instance()->m_queue; auto event = q.memcpy(dst, src, n); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES @@ -46,9 +46,8 @@ void DeepCopyAsyncSYCL(const Kokkos::Experimental::SYCL& instance, void* dst, } void DeepCopyAsyncSYCL(void* dst, const void* src, size_t n) { - Experimental::Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); - Experimental::SYCL().fence( - "Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); + Impl::SYCLInternal::singleton().m_queue->memcpy(dst, src, n); + SYCL().fence("Kokkos::Impl::DeepCopyAsyncSYCL: fence after memcpy"); } } // namespace Impl @@ -60,12 +59,9 @@ namespace { std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { switch (allocation_kind) { - case sycl::usm::alloc::host: - return Kokkos::Experimental::SYCLHostUSMSpace::name(); - case sycl::usm::alloc::device: - return Kokkos::Experimental::SYCLDeviceUSMSpace::name(); - case sycl::usm::alloc::shared: - return Kokkos::Experimental::SYCLSharedUSMSpace::name(); + case sycl::usm::alloc::host: return Kokkos::SYCLHostUSMSpace::name(); + case sycl::usm::alloc::device: return Kokkos::SYCLDeviceUSMSpace::name(); + case sycl::usm::alloc::shared: return Kokkos::SYCLSharedUSMSpace::name(); default: Kokkos::abort("bug: unknown sycl allocation type"); return "unreachable"; @@ -75,7 +71,6 @@ std::string_view get_memory_space_name(sycl::usm::alloc allocation_kind) { } // namespace namespace Kokkos { -namespace Experimental { SYCLDeviceUSMSpace::SYCLDeviceUSMSpace() : m_queue(*SYCL().impl_internal_space_instance()->m_queue) {} @@ -114,12 +109,12 @@ void* allocate_sycl(const char* arg_label, const size_t arg_alloc_size, return hostPtr; } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const size_t arg_alloc_size) const { return allocate(exec_space, "[unlabeled]", arg_alloc_size); } -void* SYCLDeviceUSMSpace::allocate(const Kokkos::Experimental::SYCL& exec_space, +void* SYCLDeviceUSMSpace::allocate(const Kokkos::SYCL& exec_space, const char* arg_label, const size_t arg_alloc_size, const size_t arg_logical_size) const { @@ -244,7 +239,6 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, Kokkos::Tools::make_space_handle(name()), m_queue); } -} // namespace Experimental } // namespace Kokkos //============================================================================== @@ -253,11 +247,11 @@ void SYCLHostUSMSpace::deallocate(const char* arg_label, #include KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); + Kokkos::SYCLDeviceUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLSharedUSMSpace); + Kokkos::SYCLSharedUSMSpace); KOKKOS_IMPL_SHARED_ALLOCATION_RECORD_EXPLICIT_INSTANTIATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLHostUSMSpace); // end Explicit instantiations of CRTP Base classes }}}1 //============================================================================== diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp index b86cfca413..5a37da130c 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Space.hpp @@ -39,8 +39,6 @@ template struct is_sycl_type_space : public std::false_type {}; } // namespace Impl -namespace Experimental { - class SYCLDeviceUSMSpace { public: using execution_space = SYCL; @@ -154,45 +152,40 @@ class SYCLHostUSMSpace { sycl::queue m_queue; }; -} // namespace Experimental - namespace Impl { template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type { +}; template <> -struct is_sycl_type_space - : public std::true_type {}; +struct is_sycl_type_space : public std::true_type {}; -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLSharedUSMSpace, - Kokkos::Experimental::SYCLSharedUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); -static_assert(Kokkos::Impl::MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::Experimental::SYCLDeviceUSMSpace>::assignable); +static_assert( + Kokkos::Impl::MemorySpaceAccess::assignable); template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space != SYCLSharedUSMSpace::execution_space enum : bool { assignable = false }; enum : bool { accessible = true }; @@ -200,26 +193,24 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { // HostSpace::execution_space == - // Experimental::SYCLHostUSMSpace::execution_space + // SYCLHostUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLDeviceUSMSpace::execution_space == SYCLSharedUSMSpace::execution_space enum : bool { assignable = true }; enum : bool { accessible = true }; @@ -227,14 +218,11 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLDeviceUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLDeviceUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLDeviceUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLDeviceUSMSpace::execution_space enum : bool { deepcopy = true }; }; @@ -243,16 +231,15 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; enum : bool { accessible = false }; // SYCL cannot access HostSpace enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { // SYCLSharedUSMSpace::execution_space == SYCLDeviceUSMSpace::execution_space // Can access SYCLSharedUSMSpace from Host but cannot access // SYCLDeviceUSMSpace from Host @@ -264,47 +251,38 @@ struct MemorySpaceAccess -struct MemorySpaceAccess { - // Experimental::SYCLSharedUSMSpace::execution_space != - // Experimental::SYCLHostUSMSpace::execution_space +struct MemorySpaceAccess { + // SYCLSharedUSMSpace::execution_space != + // SYCLHostUSMSpace::execution_space enum : bool { assignable = false }; - enum : bool { - accessible = true - }; // Experimental::SYCLSharedUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLSharedUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from SYCL - enum : bool { - accessible = true - }; // Experimental::SYCLHostUSMSpace::execution_space + enum : bool { accessible = true }; // SYCLHostUSMSpace::execution_space enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // Cannot access from Host enum : bool { accessible = false }; enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess { +struct MemorySpaceAccess { enum : bool { assignable = false }; // different execution_space enum : bool { accessible = true }; // same accessibility enum : bool { deepcopy = true }; }; template <> -struct MemorySpaceAccess< - Kokkos::Experimental::SYCLDeviceUSMSpace, - Kokkos::ScratchMemorySpace> { +struct MemorySpaceAccess> { enum : bool { assignable = false }; enum : bool { accessible = true }; enum : bool { deepcopy = false }; @@ -315,11 +293,9 @@ struct MemorySpaceAccess< } // namespace Kokkos KOKKOS_IMPL_HOST_INACCESSIBLE_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLDeviceUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLSharedUSMSpace); -KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION( - Kokkos::Experimental::SYCLHostUSMSpace); + Kokkos::SYCLDeviceUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLSharedUSMSpace); +KOKKOS_IMPL_SHARED_ALLOCATION_SPECIALIZATION(Kokkos::SYCLHostUSMSpace); #endif #endif diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp index 1e42faa5a8..6359e4a2d9 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_Team.hpp @@ -34,7 +34,7 @@ namespace Impl { */ class SYCLTeamMember { public: - using execution_space = Kokkos::Experimental::SYCL; + using execution_space = Kokkos::SYCL; using scratch_memory_space = execution_space::scratch_memory_space; using team_handle = SYCLTeamMember; @@ -126,6 +126,20 @@ class SYCLTeamMember { team_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const noexcept { using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const noexcept { + using value_type = typename WrappedReducerType::value_type; auto sg = m_item.get_sub_group(); const auto sub_group_range = sg.get_local_range()[0]; @@ -139,7 +153,7 @@ class SYCLTeamMember { if (vector_range * shift < sub_group_range) { const value_type tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } }; shuffle_combine(1); @@ -153,14 +167,13 @@ class SYCLTeamMember { shift <<= 1) { auto tmp = Kokkos::Impl::SYCLReduction::shift_group_left( sg, value, vector_range * shift); - if (team_rank_ + shift < team_size_) reducer.join(value, tmp); + if (team_rank_ + shift < team_size_) wrapped_reducer.join(&value, &tmp); } #endif value = Kokkos::Impl::SYCLReduction::select_from_group(sg, value, 0); const int n_subgroups = sg.get_group_range()[0]; if (n_subgroups == 1) { - reducer.reference() = value; return; } @@ -187,16 +200,15 @@ class SYCLTeamMember { for (int start = step_width; start < n_subgroups; start += step_width) { if (id_in_sg == 0 && group_id >= start && group_id < std::min(start + step_width, n_subgroups)) - reducer.join(reduction_array[group_id - start], value); + wrapped_reducer.join(&reduction_array[group_id - start], &value); sycl::group_barrier(m_item.get_group()); } // Do the final reduction for all threads redundantly value = reduction_array[0]; for (int i = 1; i < std::min(step_width, n_subgroups); ++i) - reducer.join(value, reduction_array[i]); + wrapped_reducer.join(&value, &reduction_array[i]); - reducer.reference() = value; // Make sure that every thread is done using the reduction array. sycl::group_barrier(m_item.get_group()); } @@ -271,8 +283,8 @@ class SYCLTeamMember { const auto update = Kokkos::Impl::SYCLReduction::shift_group_right(sg, value, vector_range); - Type intermediate = (group_id > 0 ? base_data[group_id - 1] : 0) + - (id_in_sg >= vector_range ? update : 0); + Type intermediate = (group_id > 0 ? base_data[group_id - 1] : Type{0}) + + (id_in_sg >= vector_range ? update : Type{0}); if (global_accum) { if (id_in_sg == sub_group_range - 1 && @@ -311,6 +323,19 @@ class SYCLTeamMember { KOKKOS_INLINE_FUNCTION std::enable_if_t::value> vector_reduce(ReducerType const& reducer, typename ReducerType::value_type& value) const { + using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = + typename Impl::FunctorAnalysis, ReducerType, + value_type>::Reducer; + impl_vector_reduce(wrapped_reducer_type(reducer), value); + reducer.reference() = value; + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_vector_reduce(WrappedReducerType const& wrapped_reducer, + typename WrappedReducerType::value_type& value) const { const auto tidx1 = m_item.get_local_id(1); const auto grange1 = m_item.get_local_range(1); @@ -319,13 +344,13 @@ class SYCLTeamMember { if (grange1 == 1) return; // Intra vector lane shuffle reduction: - typename ReducerType::value_type tmp(value); - typename ReducerType::value_type tmp2 = tmp; + typename WrappedReducerType::value_type tmp(value); + typename WrappedReducerType::value_type tmp2 = tmp; for (int i = grange1; (i >>= 1);) { tmp2 = Kokkos::Impl::SYCLReduction::shift_group_left(sg, tmp, i); if (static_cast(tidx1) < i) { - reducer.join(tmp, tmp2); + wrapped_reducer.join(&tmp, &tmp2); } } @@ -336,8 +361,7 @@ class SYCLTeamMember { tmp2 = Kokkos::Impl::SYCLReduction::select_from_group( sg, tmp, (sg.get_local_id() / grange1) * grange1); - value = tmp2; - reducer.reference() = tmp2; + value = tmp2; } //---------------------------------------- @@ -531,8 +555,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); @@ -541,7 +573,9 @@ parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< closure(i, value); } - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel_reduce assuming summation. @@ -557,20 +591,28 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start + loop_boundaries.member.item().get_local_id(0); i < loop_boundaries.end; i += loop_boundaries.member.item().get_local_range(0)) { - closure(i, val); + closure(i, value); } - loop_boundaries.member.team_reduce(reducer, val); - result = reducer.reference(); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } /** \brief Inter-thread parallel exclusive prefix sum. @@ -657,8 +699,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -670,8 +720,11 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< i < loop_boundaries.end; i += grange0 * grange1) closure(i, value); - loop_boundaries.member.vector_reduce(reducer, value); - loop_boundaries.member.team_reduce(reducer, value); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + reducer.reference() = value; } template @@ -679,10 +732,16 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember>& loop_boundaries, const Closure& closure, ValueType& result) { - ValueType val; - Kokkos::Sum reducer(val); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; - reducer.init(reducer.reference()); + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx0 = loop_boundaries.member.item().get_local_id(0); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); @@ -692,11 +751,13 @@ parallel_reduce(const Impl::TeamVectorRangeBoundariesStruct< for (iType i = loop_boundaries.start + tidx0 * grange1 + tidx1; i < loop_boundaries.end; i += grange0 * grange1) - closure(i, val); + closure(i, value); - loop_boundaries.member.vector_reduce(reducer); - loop_boundaries.member.team_reduce(reducer); - result = reducer.reference(); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + loop_boundaries.member.impl_team_reduce(wrapped_reducer, value); + + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- @@ -746,16 +807,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ReducerType const& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, ReducerType, + value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const iType grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, reducer.reference()); + closure(i, value); - loop_boundaries.member.vector_reduce(reducer); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Intra-thread vector parallel_reduce. @@ -774,16 +846,27 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::SYCLTeamMember> const& loop_boundaries, Closure const& closure, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Closure, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(closure); + value_type value; + wrapped_reducer.init(&value); const iType tidx1 = loop_boundaries.member.item().get_local_id(1); const int grange1 = loop_boundaries.member.item().get_local_range(1); for (iType i = loop_boundaries.start + tidx1; i < loop_boundaries.end; i += grange1) - closure(i, result); + closure(i, value); - loop_boundaries.member.vector_reduce(Kokkos::Sum(result)); + loop_boundaries.member.impl_vector_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp index 17ce59058b..556ca0d281 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_TeamPolicy.hpp @@ -22,8 +22,7 @@ #include template -class Kokkos::Impl::TeamPolicyInternal +class Kokkos::Impl::TeamPolicyInternal : public PolicyTraits { public: using execution_policy = TeamPolicyInternal; @@ -45,7 +44,7 @@ class Kokkos::Impl::TeamPolicyInternal TeamPolicyInternal(TeamPolicyInternal const& p) { diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp index d55fc6a84b..79d9e8a8d4 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_UniqueToken.hpp @@ -22,13 +22,14 @@ #include namespace Kokkos { -namespace Experimental { namespace Impl { Kokkos::View sycl_global_unique_token_locks( bool deallocate = false); } +namespace Experimental { + // both global and instance Unique Tokens are implemented in the same way // the global version has one shared static lock array underneath // but it can't be a static member variable since we need to acces it on device @@ -42,7 +43,7 @@ class UniqueToken { using size_type = int32_t; explicit UniqueToken(execution_space const& = execution_space()) - : m_locks(Impl::sycl_global_unique_token_locks()) {} + : m_locks(Kokkos::Impl::sycl_global_unique_token_locks()) {} KOKKOS_DEFAULTED_FUNCTION UniqueToken(const UniqueToken&) = default; @@ -75,11 +76,15 @@ class UniqueToken { /// \brief acquire value such that 0 <= value < size() KOKKOS_INLINE_FUNCTION size_type impl_acquire() const { +#if defined(__INTEL_LLVM_COMPILER) && __INTEL_LLVM_COMPILER >= 20250000 + auto item = sycl::ext::oneapi::this_work_item::get_nd_item<3>(); +#else auto item = sycl::ext::oneapi::experimental::this_nd_item<3>(); +#endif std::size_t threadIdx[3] = {item.get_local_id(2), item.get_local_id(1), item.get_local_id(0)}; std::size_t blockIdx[3] = {item.get_group(2), item.get_group(1), - item.get_group(0)}; + item.get_group(0)}; std::size_t blockDim[3] = {item.get_local_range(2), item.get_local_range(1), item.get_local_range(0)}; @@ -122,11 +127,11 @@ class UniqueToken public: UniqueToken() : UniqueToken( - Kokkos::Experimental::SYCL().concurrency()) {} + Kokkos::SYCL().concurrency()) {} explicit UniqueToken(execution_space const& arg) : UniqueToken( - Kokkos::Experimental::SYCL().concurrency(), arg) {} + Kokkos::SYCL().concurrency(), arg) {} explicit UniqueToken(size_type max_size) : UniqueToken(max_size) {} diff --git a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp index 61db6b34aa..2905733a4d 100644 --- a/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp +++ b/lib/kokkos/core/src/SYCL/Kokkos_SYCL_ZeroMemset.hpp @@ -23,12 +23,11 @@ namespace Kokkos { namespace Impl { -template -struct ZeroMemset> { - ZeroMemset(const Kokkos::Experimental::SYCL& exec_space, - const View& dst) { - auto event = exec_space.impl_internal_space_instance()->m_queue->memset( - dst.data(), 0, dst.size() * sizeof(typename View::value_type)); +template <> +struct ZeroMemset { + ZeroMemset(const Kokkos::SYCL& exec_space, void* dst, size_t cnt) { + auto event = + exec_space.impl_internal_space_instance()->m_queue->memset(dst, 0, cnt); #ifndef KOKKOS_IMPL_SYCL_USE_IN_ORDER_QUEUES exec_space.impl_internal_space_instance() ->m_queue->ext_oneapi_submit_barrier(std::vector{event}); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp index 81d43b31b3..a1fa9e43e0 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial.hpp @@ -34,7 +34,6 @@ static_assert(false, #include #include #include -#include #include #include #include @@ -267,7 +266,7 @@ template std::vector partition_space(const Serial&, std::vector const& weights) { static_assert( - std::is_arithmetic::value, + std::is_arithmetic_v, "Kokkos Error: partitioning arguments must be integers or floats"); // We only care about the number of instances to create and ignore weights @@ -284,7 +283,9 @@ std::vector partition_space(const Serial&, #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #endif // defined( KOKKOS_ENABLE_SERIAL ) diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp index 34e115eca9..addcaba009 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_MDRange.hpp @@ -44,11 +44,16 @@ class ParallelFor, public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_iter.m_rp.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->exec(); } template @@ -112,10 +117,15 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp index 80faec9041..2ab7b7f803 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Range.hpp @@ -31,7 +31,7 @@ class ParallelFor, Kokkos::Serial> { const Policy m_policy; template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { m_functor(i); @@ -39,7 +39,7 @@ class ParallelFor, Kokkos::Serial> { } template - std::enable_if_t::value> exec() const { + std::enable_if_t> exec() const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -49,10 +49,15 @@ class ParallelFor, Kokkos::Serial> { public: inline void execute() const { + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads auto* internal_instance = m_policy.space().impl_internal_space_instance(); std::lock_guard lock(internal_instance->m_instance_mutex); +#endif this->template exec(); } @@ -79,7 +84,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -88,7 +93,7 @@ class ParallelReduce, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; @@ -108,10 +113,15 @@ class ParallelReduce, auto* internal_instance = m_policy.space().impl_internal_space_instance(); + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, thread_local_size); @@ -166,7 +176,7 @@ class ParallelScan, const Policy m_policy; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -175,7 +185,7 @@ class ParallelScan, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -194,10 +204,16 @@ class ParallelScan, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -235,7 +251,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const typename Policy::member_type e = m_policy.end(); for (typename Policy::member_type i = m_policy.begin(); i < e; ++i) { @@ -244,7 +260,7 @@ class ParallelScanWithTotal, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( reference_type update) const { const TagType t{}; const typename Policy::member_type e = m_policy.end(); @@ -262,10 +278,16 @@ class ParallelScanWithTotal, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp index a523cc86c9..7a6faf3d9f 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Parallel_Team.hpp @@ -223,7 +223,7 @@ class ParallelFor, const size_t m_shared; template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor(Member(data, ileague, m_league)); @@ -231,7 +231,7 @@ class ParallelFor, } template - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data) const { const TagType t{}; for (int ileague = 0; ileague < m_league; ++ileague) { @@ -247,10 +247,16 @@ class ParallelFor, const size_t thread_local_size = 0; // Never shrinks auto* internal_instance = m_policy.space().impl_internal_space_instance(); + + // caused a possibly codegen-related slowdown, especially in GCC 9-11 + // with KOKKOS_ARCH_NATIVE + // https://github.com/kokkos/kokkos/issues/7268 +#ifndef KOKKOS_ENABLE_ATOMICS_BYPASS // Make sure kernels are running sequentially even when using multiple // threads, lock resize_thread_team_data std::lock_guard instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, @@ -293,7 +299,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { for (int ileague = 0; ileague < m_league; ++ileague) { m_functor_reducer.get_functor()(Member(data, ileague, m_league), update); @@ -301,7 +307,7 @@ class ParallelReduce - inline std::enable_if_t::value> exec( + inline std::enable_if_t> exec( HostThreadTeamData& data, reference_type update) const { const TagType t{}; @@ -321,10 +327,16 @@ class ParallelReduce instance_lock( internal_instance->m_instance_mutex); +#endif internal_instance->resize_thread_team_data( pool_reduce_size, team_reduce_size, team_shared_size, diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp index 5905d6d32e..678d182504 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_Task.hpp @@ -25,10 +25,16 @@ #include #include #include +#include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- +#ifdef KOKKOS_ENABLE_DEPRECATION_WARNINGS +// We allow using deprecated classes in this file +KOKKOS_IMPL_DISABLE_DEPRECATED_WARNINGS_PUSH() +#endif + namespace Kokkos { namespace Impl { @@ -102,9 +108,8 @@ class TaskQueueSpecialization> { template class TaskQueueSpecializationConstrained< - Scheduler, - std::enable_if_t::value>> { + Scheduler, std::enable_if_t>> { public: // Note: Scheduler may be an incomplete type at class scope (but not inside // of the methods, obviously) @@ -215,6 +220,10 @@ extern template class TaskQueue, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp index 6ad6aabc5a..527e094079 100644 --- a/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp +++ b/lib/kokkos/core/src/Serial/Kokkos_Serial_ZeroMemset.hpp @@ -31,15 +31,11 @@ namespace Impl { // parallel execution space since the specialization for // DefaultHostExecutionSpace is defined elsewhere. struct DummyExecutionSpace; -template +template <> struct ZeroMemset< - std::conditional_t::value, - Serial, DummyExecutionSpace>, - View> { - ZeroMemset(const Serial&, const View& dst) { - using ValueType = typename View::value_type; - std::memset(dst.data(), 0, sizeof(ValueType) * dst.size()); - } + std::conditional_t, + Serial, DummyExecutionSpace>> { + ZeroMemset(const Serial&, void* dst, size_t cnt) { std::memset(dst, 0, cnt); } }; } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp index 3842966cd7..edc9489f67 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.cpp @@ -67,8 +67,9 @@ std::pair int s_thread_pool_size[3] = {0, 0, 0}; -void (*volatile s_current_function)(ThreadsInternal &, const void *); -const void *volatile s_current_function_arg = nullptr; +using s_current_function_type = void (*)(ThreadsInternal &, const void *); +std::atomic s_current_function; +std::atomic s_current_function_arg = nullptr; inline unsigned fan_size(const unsigned rank, const unsigned size) { const unsigned rank_rev = size - (rank + 1); @@ -79,7 +80,7 @@ inline unsigned fan_size(const unsigned rank, const unsigned size) { return count; } -void wait_yield(volatile ThreadState &flag, const ThreadState value) { +void wait_yield(std::atomic &flag, const ThreadState value) { while (value == flag) { std::this_thread::yield(); } @@ -135,11 +136,12 @@ ThreadsInternal::ThreadsInternal() ThreadsInternal *const nil = nullptr; // Which entry in 's_threads_exec', possibly determined from hwloc binding - const int entry = reinterpret_cast(s_current_function_arg) < - size_t(s_thread_pool_size[0]) - ? reinterpret_cast(s_current_function_arg) - : size_t(Kokkos::hwloc::bind_this_thread( - s_thread_pool_size[0], s_threads_coord)); + const int entry = + reinterpret_cast(s_current_function_arg.load()) < + size_t(s_thread_pool_size[0]) + ? reinterpret_cast(s_current_function_arg.load()) + : size_t(Kokkos::hwloc::bind_this_thread(s_thread_pool_size[0], + s_threads_coord)); // Given a good entry set this thread in the 's_threads_exec' array if (entry < s_thread_pool_size[0] && @@ -543,7 +545,7 @@ void ThreadsInternal::initialize(int thread_count_arg) { for (unsigned ith = 1; ith < thread_count; ++ith) { // Try to protect against cache coherency failure by casting to volatile. ThreadsInternal *const th = - ((ThreadsInternal * volatile *)s_threads_exec)[ith]; + ((ThreadsInternal *volatile *)s_threads_exec)[ith]; if (th) { wait_yield(th->m_pool_state, ThreadState::Active); } else { diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp index a5eb231cb0..130b3433d0 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Instance.hpp @@ -60,7 +60,7 @@ class ThreadsInternal { int m_pool_rank_rev; int m_pool_size; int m_pool_fan_size; - ThreadState volatile m_pool_state; ///< State for global synchronizations + std::atomic m_pool_state; ///< State for global synchronizations // Members for dynamic scheduling // Which thread am I stealing from currently @@ -96,7 +96,7 @@ class ThreadsInternal { return reinterpret_cast(m_scratch) + m_scratch_reduce_end; } - KOKKOS_INLINE_FUNCTION ThreadState volatile &state() { return m_pool_state; } + KOKKOS_INLINE_FUNCTION auto &state() { return m_pool_state; } KOKKOS_INLINE_FUNCTION ThreadsInternal *const *pool_base() const { return m_pool_base; } @@ -225,7 +225,7 @@ class ThreadsInternal { // to inactive triggers another thread to exit a spinwait // and read the 'reduce_memory'. // Must 'memory_fence()' to guarantee that storing the update to - // 'reduce_memory()' will complete before storing the the update to + // 'reduce_memory()' will complete before storing the update to // 'm_pool_state'. memory_fence(); @@ -403,7 +403,7 @@ class ThreadsInternal { static void start(void (*)(ThreadsInternal &, const void *), const void *); #ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 - KOKKOS_DEPRECATED static int in_parallel(); + static int in_parallel(); #endif static void fence(); static void fence(const std::string &); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp index 59577609ab..711b1b6926 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_MDRange.hpp @@ -51,7 +51,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -65,7 +65,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp index 4a89c4fad8..25aab9ebfb 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Range.hpp @@ -35,7 +35,7 @@ class ParallelFor, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ defined(KOKKOS_ENABLE_PRAGMA_IVDEP) @@ -47,7 +47,7 @@ class ParallelFor, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member ibeg, const Member iend) { const TagType t{}; #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -64,7 +64,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); @@ -77,7 +77,7 @@ class ParallelFor, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelFor &self = *((const ParallelFor *)arg); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp index f927d7c6a6..40be3884c3 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelFor_Team.hpp @@ -36,8 +36,8 @@ class ParallelFor, const size_t m_shared; template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_static(); member.next_static()) { functor(member); @@ -45,8 +45,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -55,8 +55,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { for (; member.valid_dynamic(); member.next_dynamic()) { functor(member); @@ -64,8 +64,8 @@ class ParallelFor, } template - inline static std::enable_if_t::value && - std::is_same::value> + inline static std::enable_if_t && + std::is_same_v> exec_team(const FunctorType &functor, Member member) { const TagType t{}; for (; member.valid_dynamic(); member.next_dynamic()) { @@ -88,8 +88,12 @@ class ParallelFor, policy.impl_set_vector_length(1); } if (policy.team_size() < 0) { - policy.impl_set_team_size( - policy.team_size_recommended(m_functor, ParallelForTag{})); + int team_size = policy.team_size_recommended(m_functor, ParallelForTag{}); + if (team_size <= 0) + Kokkos::Impl::throw_runtime_exception( + "Kokkos::Impl::ParallelFor could not find a " + "valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp index fa63215a9e..9f28f9bbfc 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_MDRange.hpp @@ -59,7 +59,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -76,7 +76,7 @@ class ParallelReduce - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); @@ -91,8 +91,8 @@ class ParallelReduce(instance.reduce_memory())); + reference_type update = + reducer.init(static_cast(instance.reduce_memory())); while (work_index != -1) { const Member begin = static_cast(work_index); const Member end = begin + 1 < num_tiles ? begin + 1 : num_tiles; @@ -100,7 +100,7 @@ class ParallelReduce, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -55,7 +55,7 @@ class ParallelReduce, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update) { const TagType t{}; @@ -73,7 +73,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), @@ -89,7 +89,7 @@ class ParallelReduce, } template - static std::enable_if_t::value> + static std::enable_if_t> exec_schedule(ThreadsInternal &instance, const void *arg) { const ParallelReduce &self = *((const ParallelReduce *)arg); const WorkRange range(self.m_policy, instance.pool_rank(), diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp index 4db310701f..69527ee3e6 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelReduce_Team.hpp @@ -42,7 +42,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, reference_type update) { for (; member.valid_static(); member.next_static()) { functor(member, update); @@ -50,7 +50,7 @@ class ParallelReduce - inline static std::enable_if_t::value> exec_team( + inline static std::enable_if_t> exec_team( const FunctorType &functor, Member member, reference_type update) { const TagType t{}; for (; member.valid_static(); member.next_static()) { @@ -106,9 +106,14 @@ class ParallelReduce could not find " + "a valid execution configuration."); + policy.impl_set_team_size(team_size); } return policy; } diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp index 62f34d741f..d54f4ca952 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_ParallelScan_Range.hpp @@ -39,7 +39,7 @@ class ParallelScan, const Policy m_policy; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -52,7 +52,7 @@ class ParallelScan, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; @@ -119,7 +119,7 @@ class ParallelScanWithTotal, const pointer_type m_result_ptr; template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { #if defined(KOKKOS_ENABLE_AGGRESSIVE_VECTORIZATION) && \ @@ -132,7 +132,7 @@ class ParallelScanWithTotal, } template - inline static std::enable_if_t::value> exec_range( + inline static std::enable_if_t> exec_range( const FunctorType &functor, const Member &ibeg, const Member &iend, reference_type update, const bool final) { const TagType t{}; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp index 3df9dc07bf..0f9a77f2af 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.cpp @@ -108,7 +108,7 @@ void host_thread_yield(const uint32_t i, const WaitMode mode) { #endif /* defined( KOKKOS_ENABLE_ASM ) */ } -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value) { Kokkos::store_fence(); uint32_t i = 0; diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp index b98b6dbb73..7ab43cdb7a 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Spinwait.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace Kokkos { namespace Impl { @@ -34,7 +35,7 @@ enum class WaitMode : int { void host_thread_yield(const uint32_t i, const WaitMode mode); -void spinwait_while_equal(ThreadState const volatile& flag, +void spinwait_while_equal(std::atomic const& flag, ThreadState const value); } // namespace Impl diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp index a3501a437d..f627e0d47a 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_Team.hpp @@ -143,8 +143,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); @@ -164,8 +164,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; f(value); if (m_team_base) { type* const local_value = ((type*)m_team_base[0]->scratch_memory()); memory_fence(); @@ -186,7 +186,7 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: using type = - typename if_c::type; + std::conditional_t; if (team_rank() != team_size() - 1) * ((volatile type*)m_instance->scratch_memory()) = value; @@ -215,52 +215,65 @@ class ThreadsExecTeamMember { } template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - team_reduce(const ReducerType& reducer, - const typename ReducerType::value_type contribution) const { + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + team_reduce(const ReducerType& reducer, + typename ReducerType::value_type& contribution) const { KOKKOS_IF_ON_DEVICE(((void)reducer; (void)contribution;)) - KOKKOS_IF_ON_HOST(( - using value_type = typename ReducerType::value_type; - // Make sure there is enough scratch space: - using type = typename if_c::type; + KOKKOS_IF_ON_HOST( + (using value_type = typename ReducerType::value_type; + using wrapped_reducer_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, TeamPolicy, + ReducerType, value_type>::Reducer; + impl_team_reduce(wrapped_reducer_type(reducer), contribution); + reducer.reference() = contribution;)) + } - type* const local_value = ((type*)m_instance->scratch_memory()); + template + KOKKOS_INLINE_FUNCTION std::enable_if_t::value> + impl_team_reduce( + const WrappedReducerType& wrapped_reducer, + typename WrappedReducerType::value_type& contribution) const { + using value_type = typename WrappedReducerType::value_type; + // Make sure there is enough scratch space: + using type = std::conditional_t; - // Set this thread's contribution - if (team_rank() != team_size() - 1) { *local_value = contribution; } + type* const local_value = ((type*)m_instance->scratch_memory()); - // Fence to make sure the base team member has access: - memory_fence(); + // Set this thread's contribution + if (team_rank() != team_size() - 1) { + *local_value = contribution; + } - if (team_fan_in()) { - // The last thread to synchronize returns true, all other threads - // wait for team_fan_out() - type* const team_value = ((type*)m_team_base[0]->scratch_memory()); + // Fence to make sure the base team member has access: + memory_fence(); - *team_value = contribution; - // Join to the team value: - for (int i = 1; i < m_team_size; ++i) { - reducer.join(*team_value, - *((type*)m_team_base[i]->scratch_memory())); - } + if (team_fan_in()) { + // The last thread to synchronize returns true, all other threads + // wait for team_fan_out() + type* const team_value = ((type*)m_team_base[0]->scratch_memory()); - // Team base thread may "lap" member threads so copy out to their - // local value. - for (int i = 1; i < m_team_size; ++i) { - *((type*)m_team_base[i]->scratch_memory()) = *team_value; - } + *team_value = contribution; + // Join to the team value: + for (int i = 1; i < m_team_size; ++i) { + wrapped_reducer.join(team_value, + ((type*)m_team_base[i]->scratch_memory())); + } - // Fence to make sure all team members have access - memory_fence(); - } + // Team base thread may "lap" member threads so copy out to their + // local value. + for (int i = 1; i < m_team_size; ++i) { + *((type*)m_team_base[i]->scratch_memory()) = *team_value; + } - team_fan_out(); + // Fence to make sure all team members have access + memory_fence(); + } - // Value was changed by the team base - reducer.reference() = *local_value;)) + team_fan_out(); + + contribution = *local_value; } /** \brief Intra-team exclusive prefix sum with team_rank() ordering @@ -278,8 +291,8 @@ class ThreadsExecTeamMember { KOKKOS_IF_ON_DEVICE(((void)global_accum; return value;)) KOKKOS_IF_ON_HOST(( // Make sure there is enough scratch space: - using type = typename if_c::type; + using type = std::conditional_t; volatile type* const work_value = ((type*)m_instance->scratch_memory()); @@ -887,19 +900,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - ValueType intermediate; - Sum sum(intermediate); - sum.init(intermediate); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - ValueType tmp = ValueType(); - lambda(i, tmp); - intermediate += tmp; + lambda(i, value); } - loop_boundaries.thread.team_reduce(sum, intermediate); - result = sum.reference(); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + result = value; } template @@ -907,15 +926,25 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::TeamThreadRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - typename ReducerType::value_type value; - reducer.init(value); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { lambda(i, value); } - loop_boundaries.thread.team_reduce(reducer, value); + loop_boundaries.thread.impl_team_reduce(wrapped_reducer, value); + wrapped_reducer.final(&value); + reducer.reference() = value; } } // namespace Kokkos @@ -950,11 +979,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, ValueType& result) { - result = ValueType(); + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, Lambda, + ValueType>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + using value_type = typename wrapped_reducer_type::value_type; + + wrapped_reducer_type wrapped_reducer(lambda); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, result); + lambda(i, value); } + + wrapped_reducer.final(&value); + result = value; } template @@ -962,11 +1004,24 @@ KOKKOS_INLINE_FUNCTION std::enable_if_t::value> parallel_reduce(const Impl::ThreadVectorRangeBoundariesStruct< iType, Impl::ThreadsExecTeamMember>& loop_boundaries, const Lambda& lambda, const ReducerType& reducer) { - reducer.init(reducer.reference()); + using value_type = typename ReducerType::value_type; + using functor_analysis_type = typename Impl::FunctorAnalysis< + Impl::FunctorPatternInterface::REDUCE, + TeamPolicy, + ReducerType, value_type>; + using wrapped_reducer_type = typename functor_analysis_type::Reducer; + + wrapped_reducer_type wrapped_reducer(reducer); + value_type value; + wrapped_reducer.init(&value); + for (iType i = loop_boundaries.start; i < loop_boundaries.end; i += loop_boundaries.increment) { - lambda(i, reducer.reference()); + lambda(i, value); } + + wrapped_reducer.final(&value); + reducer.reference() = value; } /** \brief Inter-thread parallel exclusive prefix sum. Executes @@ -1049,7 +1104,7 @@ KOKKOS_INLINE_FUNCTION void parallel_scan( typename Impl::FunctorAnalysis, FunctorType, void>::value_type; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Non-matching value types of closure and return type"); ValueType scan_val = ValueType(); diff --git a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp index c88d66db5f..5fed92db26 100644 --- a/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp +++ b/lib/kokkos/core/src/Threads/Kokkos_Threads_WorkGraphPolicy.hpp @@ -36,13 +36,13 @@ class ParallelFor, FunctorType m_functor; template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { m_functor(w); } template - std::enable_if_t::value> exec_one( + std::enable_if_t> exec_one( const std::int32_t w) const noexcept { const TagType t{}; m_functor(t, w); diff --git a/lib/kokkos/core/src/View/Kokkos_BasicView.hpp b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp new file mode 100644 index 0000000000..29eafca62e --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_BasicView.hpp @@ -0,0 +1,652 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif + +#ifndef KOKKOS_BASIC_VIEW_HPP +#define KOKKOS_BASIC_VIEW_HPP +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +// FIXME: we need to make this work for not using our mdspan impl +#define KOKKOS_IMPL_NO_UNIQUE_ADDRESS _MDSPAN_NO_UNIQUE_ADDRESS +namespace Kokkos::Impl { + +constexpr inline struct SubViewCtorTag { + explicit SubViewCtorTag() = default; +} subview_ctor_tag{}; + +template +struct KokkosSliceToMDSpanSliceImpl { + using type = T; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(const T &s) { return s; } +}; + +template <> +struct KokkosSliceToMDSpanSliceImpl { + using type = full_extent_t; + KOKKOS_FUNCTION + static constexpr decltype(auto) transform(Kokkos::ALL_t) { + return full_extent; + } +}; + +template +using kokkos_slice_to_mdspan_slice = + typename KokkosSliceToMDSpanSliceImpl::type; + +template +KOKKOS_INLINE_FUNCTION constexpr decltype(auto) +transform_kokkos_slice_to_mdspan_slice(const T &s) { + return KokkosSliceToMDSpanSliceImpl::transform(s); +} + +// We do have implementation detail versions of these in our mdspan impl +// However they are not part of the public standard interface +template +struct is_layout_right_padded : public std::false_type {}; + +template +struct is_layout_right_padded> + : public std::true_type {}; + +template +struct is_layout_left_padded : public std::false_type {}; + +template +struct is_layout_left_padded> + : public std::true_type {}; + +template +class BasicView { + public: + using mdspan_type = + mdspan; + using extents_type = typename mdspan_type::extents_type; + using layout_type = typename mdspan_type::layout_type; + using accessor_type = typename mdspan_type::accessor_type; + using mapping_type = typename mdspan_type::mapping_type; + using element_type = typename mdspan_type::element_type; + using value_type = typename mdspan_type::value_type; + using index_type = typename mdspan_type::index_type; + using size_type = typename mdspan_type::size_type; + using rank_type = typename mdspan_type::rank_type; + using data_handle_type = typename mdspan_type::data_handle_type; + using reference = typename mdspan_type::reference; + using memory_space = typename accessor_type::memory_space; + using execution_space = typename memory_space::execution_space; + + // For now View and BasicView will have a restriction that the data handle + // needs to be convertible to element_type* and vice versa + static_assert(std::is_constructible_v); + static_assert(std::is_constructible_v); + + KOKKOS_FUNCTION static constexpr rank_type rank() noexcept { + return extents_type::rank(); + } + KOKKOS_FUNCTION static constexpr rank_type rank_dynamic() noexcept { + return extents_type::rank_dynamic(); + } + KOKKOS_FUNCTION static constexpr size_t static_extent(rank_type r) noexcept { + return extents_type::static_extent(r); + } + KOKKOS_FUNCTION constexpr index_type extent(rank_type r) const noexcept { + return m_map.extents().extent(r); + }; + + protected: + // These are pre-condition checks which are unconditionally (i.e. in release + // mode) enabled in Kokkos::View 4.4 + template + KOKKOS_FUNCTION static constexpr void check_basic_view_constructibility( + [[maybe_unused]] const OtherMapping &rhs) { + using src_t = typename OtherMapping::layout_type; + using dst_t = layout_type; + constexpr size_t rnk = mdspan_type::rank(); + if constexpr (!std::is_same_v) { + if constexpr (Impl::is_layout_left_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == 0 ? rhs.stride(1) : rhs.extents().extent(r)); + } + } + } + if constexpr (Impl::is_layout_right_padded::value) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + if constexpr (rnk > 1) + stride *= (r == rnk ? rhs.stride(r - 2) + : rhs.extents().extent(r - 1)); + } + } + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + for (size_t r = 0; r < rnk; r++) { + if (rhs.stride(r) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r); + } + } else if constexpr (Impl::is_layout_left_padded::value && + rnk > 1) { + if (rhs.stride(1) != rhs.extents().extent(0)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + if constexpr (std::is_same_v) { + if constexpr (std::is_same_v) { + index_type stride = 1; + if constexpr (rnk > 0) { + for (size_t r = rnk; r > 0; r--) { + if (rhs.stride(r - 1) != stride) + Kokkos::abort("View assignment must have compatible layouts"); + stride *= rhs.extents().extent(r - 1); + } + } + } else if constexpr (Impl::is_layout_right_padded::value && + rnk > 1) { + if (rhs.stride(rnk - 2) != rhs.extents().extent(rnk - 1)) + Kokkos::abort("View assignment must have compatible layouts"); + } + } + } + } + + public: + KOKKOS_DEFAULTED_FUNCTION constexpr BasicView() = default; + + KOKKOS_FUNCTION constexpr BasicView(const mdspan_type &other) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()){}; + KOKKOS_FUNCTION constexpr BasicView(mdspan_type &&other) + : m_ptr(std::move(other.data_handle())), + m_map(std::move(other.mapping())), + m_acc(std::move(other.accessor())){}; + + template + // requires(std::is_constructible_v) + KOKKOS_FUNCTION explicit constexpr BasicView( + std::enable_if_t, + data_handle_type> + p, + OtherIndexTypes... exts) + : m_ptr(std::move(p)), + m_map(extents_type(static_cast(std::move(exts))...)), + m_acc{} {} + + template + // When doing C++20 we should switch to this, the conditional explicit we + // can't do in 17 + // requires(std::is_constructible_v>) + // explicit(Size != rank_dynamic()) + KOKKOS_FUNCTION constexpr BasicView( + std::enable_if_t< + std::is_constructible_v>, + data_handle_type> + p, + const Array &exts) + : m_ptr(std::move(p)), m_map(extents_type(exts)), m_acc{} {} + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, + const extents_type &exts) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v && + std::is_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(exts), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m) +// Compilation will simply fail in C++17 and overload set should not be an issue +#ifndef KOKKOS_ENABLE_CXX17 + requires(std::is_default_constructible_v) +#endif + : m_ptr(std::move(p)), m_map(m), m_acc{} { + } + + KOKKOS_FUNCTION constexpr BasicView(data_handle_type p, const mapping_type &m, + const accessor_type &a) + : m_ptr(std::move(p)), m_map(m), m_acc(a) {} + + template +// requires(std::is_constructible_v::mdspan_type>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const BasicView &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, typename BasicView::mdspan_type>, + void *> = nullptr) + : m_ptr(other.m_ptr), m_map(other.m_map), m_acc(other.m_acc) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: incompatible extents for View construction"); + } + + template +// requires(std::is_constructible_v>) +#ifndef KOKKOS_ENABLE_CXX17 + explicit( + !std::is_convertible_v &, + mapping_type> || + !std::is_convertible_v) +#endif + KOKKOS_INLINE_FUNCTION + BasicView(const mdspan &other, + std::enable_if_t< + std::is_constructible_v< + mdspan_type, mdspan>, + void *> = nullptr) + : m_ptr(other.data_handle()), + m_map(other.mapping()), + m_acc(other.accessor()) { + // Kokkos View precondition checks happen in release builds + check_basic_view_constructibility(other.mapping()); + + static_assert( + std::is_constructible_v, + "Kokkos::View: incompatible data_handle_type for View construction"); + static_assert(std::is_constructible_v, + "Kokkos::View: incompatible extents for View construction"); + } + + // Allocating constructors specific to BasicView + /// + /// Construct from a given mapping + /// + explicit constexpr BasicView(const std::string &label, + const mapping_type &mapping) + : BasicView(view_alloc(label), mapping) {} + + /// + /// Construct from a given extents + /// + explicit constexpr BasicView(const std::string &label, + const extents_type &ext) + : BasicView(view_alloc(label), mapping_type{ext}) {} + + private: + template + data_handle_type create_data_handle( + const Impl::ViewCtorProp &arg_prop, + const typename mdspan_type::mapping_type &arg_mapping) { + constexpr bool has_exec = Impl::ViewCtorProp::has_execution_space; + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, memory_space{}, execution_space{}); + using alloc_prop = decltype(prop_copy); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + return data_handle_type(Impl::make_shared_allocation_record( + arg_mapping.required_span_size(), + Impl::get_property(prop_copy), + Impl::get_property(prop_copy), + has_exec ? std::optional{Impl::get_property< + Impl::ExecutionSpaceTag>(prop_copy)} + : std::optional{std::nullopt}, + std::integral_constant(), + std::integral_constant())); + } + + public: + template + // requires(!Impl::ViewCtorProp::has_pointer) + explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView(create_data_handle(arg_prop, arg_mapping), arg_mapping) {} + + template + // requires(Impl::ViewCtorProp::has_pointer) + KOKKOS_FUNCTION explicit inline BasicView( + const Impl::ViewCtorProp &arg_prop, + std::enable_if_t::has_pointer, + typename mdspan_type::mapping_type> const &arg_mapping) + : BasicView( + data_handle_type(Impl::get_property(arg_prop)), + arg_mapping) {} + + protected: + template + KOKKOS_INLINE_FUNCTION BasicView( + Impl::SubViewCtorTag, + const BasicView &src_view, + SliceSpecifiers... slices) + : BasicView(submdspan( + src_view.to_mdspan(), + Impl::transform_kokkos_slice_to_mdspan_slice(slices)...)) {} + + public: + //---------------------------------------- + // Conversion to MDSpan + template , + mdspan_type>>> + KOKKOS_INLINE_FUNCTION constexpr + operator mdspan() const { + return mdspan_type(m_ptr, m_map, m_acc); + } + + // Here we use an overload instead of a default parameter as a workaround + // to a potential compiler bug with clang 17. It may be present in other + // compilers + template >> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan() const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), static_cast(accessor())); + } + + template < + class OtherAccessorType = AccessorPolicy, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType &other_accessor) const { + using ret_mdspan_type = + mdspan; + return ret_mdspan_type( + static_cast( + data_handle()), + mapping(), other_accessor); + } + + KOKKOS_FUNCTION void assign_data(element_type *ptr) { m_ptr = ptr; } + + // ========================= mdspan ================================= + + // [mdspan.mdspan.members], members + +// Introducing the C++20 and C++23 variants of the operators already +#ifndef KOKKOS_ENABLE_CXX17 +#ifndef KOKKOS_ENABLE_CXX20 + // C++23 only operator[] + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator[]( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator[]( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#endif + + // C++20 operator() + template + requires((std::is_convertible_v && ...) && + (std::is_nothrow_constructible_v && + ...) && + (sizeof...(OtherIndexTypes) == rank())) + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + const Array &indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } + + template + requires( + std::is_convertible_v && + std::is_nothrow_constructible_v) + KOKKOS_FUNCTION constexpr reference operator()( + std::span indices) const { + return m_acc.access(m_ptr, + [&](std::index_sequence) { + return m_map(indices[Idxs]...); + }(std::make_index_sequence())); + } +#else + // C++17 variant of operator() + + // Some weird unexplained issue in compiling the SFINAE version with CUDA/MSVC + // So we just use post factor check here with static_assert +#if defined(KOKKOS_ENABLE_CUDA) && defined(_WIN32) + template + KOKKOS_FUNCTION constexpr reference operator()( + OtherIndexTypes... indices) const { + static_assert((std::is_convertible_v && ...)); + static_assert( + (std::is_nothrow_constructible_v && ...)); + static_assert((sizeof...(OtherIndexTypes)) == rank()); + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#else + template + KOKKOS_FUNCTION constexpr std::enable_if_t< + ((std::is_convertible_v && ...)) && + ((std::is_nothrow_constructible_v && + ...)) && + ((sizeof...(OtherIndexTypes)) == rank()), + reference> + operator()(OtherIndexTypes... indices) const { + return m_acc.access(m_ptr, + m_map(static_cast(std::move(indices))...)); + } +#endif +#endif + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside size() + template + KOKKOS_FUNCTION constexpr size_type size_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return 0u; + return ((static_cast(m_map.extents().extent(Idxs))) * ... * + size_type(1)); + } + + public: + KOKKOS_FUNCTION constexpr size_type size() const noexcept { + return size_impl(std::make_index_sequence()); + } + + private: + // FIXME_CXX20: could use inline templated lambda in C++20 mode inside empty() + template + KOKKOS_FUNCTION constexpr bool empty_impl( + std::index_sequence) const noexcept { + // Note we restrict data_handle to be convertible to element_type* for now. + // This is also different from mdspan: mdspan can NOT be legally in a state + // where m_ptr is nullptr and the product of extents is non-zero + // The default constructor of mdspan is constrained to dynamic_rank > 0 + // For View we do not have that constraint today + if (data_handle() == nullptr) return true; + return (rank() > 0) && + ((m_map.extents().extent(Idxs) == index_type(0)) || ... || false); + } + + public: + [[nodiscard]] KOKKOS_FUNCTION constexpr bool empty() const noexcept { + return empty_impl(std::make_index_sequence()); + } + + KOKKOS_FUNCTION friend constexpr void swap(BasicView &x, + BasicView &y) noexcept { + kokkos_swap(x.m_ptr, y.m_ptr); + kokkos_swap(x.m_map, y.m_map); + kokkos_swap(x.m_acc, y.m_acc); + } + + KOKKOS_FUNCTION constexpr const extents_type &extents() const noexcept { + return m_map.extents(); + }; + KOKKOS_FUNCTION constexpr const data_handle_type &data_handle() + const noexcept { + return m_ptr; + }; + KOKKOS_FUNCTION constexpr const mapping_type &mapping() const noexcept { + return m_map; + }; + KOKKOS_FUNCTION constexpr const accessor_type &accessor() const noexcept { + return m_acc; + }; + + KOKKOS_FUNCTION static constexpr bool is_always_unique() noexcept { + return mapping_type::is_always_unique(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_exhaustive() noexcept { + return mapping_type::is_always_exhaustive(); + }; + KOKKOS_FUNCTION static constexpr bool is_always_strided() noexcept { + return mapping_type::is_always_strided(); + }; + + KOKKOS_FUNCTION constexpr bool is_unique() const { + return m_map.is_unique(); + }; + KOKKOS_FUNCTION constexpr bool is_exhaustive() const { + return m_map.is_exhaustive(); + }; + KOKKOS_FUNCTION constexpr bool is_strided() const { + return m_map.is_strided(); + }; + KOKKOS_FUNCTION constexpr index_type stride(rank_type r) const { + return m_map.stride(r); + }; + + protected: +#ifndef __NVCC__ + KOKKOS_IMPL_NO_UNIQUE_ADDRESS data_handle_type m_ptr{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS mapping_type m_map{}; + KOKKOS_IMPL_NO_UNIQUE_ADDRESS accessor_type m_acc{}; +#else + data_handle_type m_ptr{}; + mapping_type m_map{}; + accessor_type m_acc{}; +#endif + + template + friend class BasicView; +}; +} // namespace Kokkos::Impl + +#endif diff --git a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp index 1ade75692f..eb11630b21 100644 --- a/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAlloc.hpp @@ -26,6 +26,7 @@ static_assert(false, #include #include #include +#include #include #include @@ -41,22 +42,8 @@ bool is_zero_byte(const T& x) { return std::memcmp(&x, all_zeroes, sizeof(T)) == 0; } -//---------------------------------------------------------------------------- - -/* - * The construction, assignment to default, and destruction - * are merged into a single functor. - * Primarily to work around an unresolved CUDA back-end bug - * that would lose the destruction cuda device function when - * called from the shared memory tracking destruction. - * Secondarily to have two fewer partial specializations. - */ -template ::value> -struct ViewValueFunctor; - template -struct ViewValueFunctor { +struct ViewValueFunctor { using ExecSpace = typename DeviceType::execution_space; struct DestroyTag {}; @@ -68,20 +55,31 @@ struct ViewValueFunctor { std::string name; bool default_exec_space; - template - KOKKOS_INLINE_FUNCTION - std::enable_if_t::value> - operator()(ConstructTag const&, const size_t i) const { + template + KOKKOS_FUNCTION + std::enable_if_t> + operator()(ConstructTag, const size_t i) const { new (ptr + i) ValueType(); } - KOKKOS_INLINE_FUNCTION void operator()(DestroyTag const&, - const size_t i) const { + KOKKOS_FUNCTION void operator()(DestroyTag, const size_t i) const { + // When instantiating a View on host execution space with a host only + // destructor the workaround for CUDA device symbol instantiation tries to + // still compile a destruction kernel for the device, and issues a warning + // for host from host-device +#ifdef KOKKOS_ENABLE_CUDA + if constexpr (std::is_same_v) { + KOKKOS_IF_ON_DEVICE(((ptr + i)->~ValueType();)) + } else { + KOKKOS_IF_ON_HOST(((ptr + i)->~ValueType();)) + } +#else (ptr + i)->~ValueType(); +#endif } - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; + ViewValueFunctor() = default; + ViewValueFunctor(const ViewValueFunctor&) = default; ViewValueFunctor& operator=(const ViewValueFunctor&) = default; ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, @@ -104,49 +102,6 @@ struct ViewValueFunctor { functor_instantiate_workaround(); } - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_dispatch() { - ValueType value{}; -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_dispatch() { - parallel_for_implementation(); - } - template void parallel_for_implementation() { using PolicyType = @@ -172,24 +127,62 @@ struct ViewValueFunctor { const Kokkos::Impl::ParallelFor closure( *this, policy); closure.execute(); - if (default_exec_space || std::is_same_v) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); if (Kokkos::Profiling::profileLibraryLoaded()) { Kokkos::Profiling::endParallelFor(kpID); } + if (default_exec_space || std::is_same_v) { + space.fence(std::is_same_v + ? "Kokkos::View::destruction before deallocate" + : "Kokkos::View::initialization"); + } } - void construct_shared_allocation() { construct_dispatch(); } + // Shortcut for zero initialization + void zero_memset_implementation() { + uint64_t kpID = 0; + if (Kokkos::Profiling::profileLibraryLoaded()) { + // We are not really using parallel_for here but using beginParallelFor + // instead of begin_parallel_for (and adding "via memset") is the best + // we can do to indicate that this is not supposed to be tunable (and + // doesn't really execute a parallel_for). + Kokkos::Profiling::beginParallelFor( + "Kokkos::View::initialization [" + name + "] via memset", + Kokkos::Profiling::Experimental::device_id(space), &kpID); + } + + (void)ZeroMemset(space, ptr, n * sizeof(ValueType)); + + if (Kokkos::Profiling::profileLibraryLoaded()) { + Kokkos::Profiling::endParallelFor(kpID); + } + if (default_exec_space) { + space.fence("Kokkos::View::initialization via memset"); + } + } + + void construct_shared_allocation() { +// On A64FX memset seems to do the wrong thing with regards to first touch +// leading to the significant performance issues +#ifndef KOKKOS_ARCH_A64FX + if constexpr (std::is_trivial_v) { + // value-initialization is equivalent to filling with zeros + zero_memset_implementation(); + } else +#endif + parallel_for_implementation(); + } void destroy_shared_allocation() { + if constexpr (std::is_trivially_destructible_v) { + // do nothing, don't bother calling the destructor + } else { #ifdef KOKKOS_ENABLE_IMPL_VIEW_OF_VIEWS_DESTRUCTOR_PRECONDITION_VIOLATION_WORKAROUND - if constexpr (std::is_same_v) - for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); - else + if constexpr (std::is_same_v) + for (size_t i = 0; i < n; ++i) (ptr + i)->~ValueType(); + else #endif - { - parallel_for_implementation(); + parallel_for_implementation(); } } @@ -206,114 +199,6 @@ struct ViewValueFunctor { } }; -template -struct ViewValueFunctor { - using ExecSpace = typename DeviceType::execution_space; - using PolicyType = Kokkos::RangePolicy>; - - ExecSpace space; - ValueType* ptr; - size_t n; - std::string name; - bool default_exec_space; - - KOKKOS_INLINE_FUNCTION - void operator()(const size_t i) const { ptr[i] = ValueType(); } - - ViewValueFunctor() = default; - ViewValueFunctor(const ViewValueFunctor&) = default; - ViewValueFunctor& operator=(const ViewValueFunctor&) = default; - - ViewValueFunctor(ExecSpace const& arg_space, ValueType* const arg_ptr, - size_t const arg_n, std::string arg_name) - : space(arg_space), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(false) {} - - ViewValueFunctor(ValueType* const arg_ptr, size_t const arg_n, - std::string arg_name) - : space(ExecSpace{}), - ptr(arg_ptr), - n(arg_n), - name(std::move(arg_name)), - default_exec_space(true) {} - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value> - construct_shared_allocation() { - // Shortcut for zero initialization -// On A64FX memset seems to do the wrong thing with regards to first touch -// leading to the significant performance issues -#ifndef KOKKOS_ARCH_A64FX - ValueType value{}; - if (Impl::is_zero_byte(value)) { - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - // We are not really using parallel_for here but using beginParallelFor - // instead of begin_parallel_for (and adding "via memset") is the best - // we can do to indicate that this is not supposed to be tunable (and - // doesn't really execute a parallel_for). - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "] via memset", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } - - (void)ZeroMemset( - space, Kokkos::View>(ptr, n)); - - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - if (default_exec_space) - space.fence("Kokkos::Impl::ViewValueFunctor: View init/destroy fence"); - } else { -#endif - parallel_for_implementation(); -#ifndef KOKKOS_ARCH_A64FX - } -#endif - } - - template - std::enable_if_t::value && - std::is_trivially_copy_assignable::value)> - construct_shared_allocation() { - parallel_for_implementation(); - } - - void parallel_for_implementation() { - PolicyType policy(space, 0, n); - uint64_t kpID = 0; - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::beginParallelFor( - "Kokkos::View::initialization [" + name + "]", - Kokkos::Profiling::Experimental::device_id(space), &kpID); - } -#ifdef KOKKOS_ENABLE_CUDA - if (std::is_same::value) { - Kokkos::Impl::cuda_prefetch_pointer(space, ptr, sizeof(ValueType) * n, - true); - } -#endif - const Kokkos::Impl::ParallelFor closure( - *this, policy); - closure.execute(); - if (default_exec_space) - space.fence( - "Kokkos::Impl::ViewValueFunctor: Fence after setting values in " - "view"); - if (Kokkos::Profiling::profileLibraryLoaded()) { - Kokkos::Profiling::endParallelFor(kpID); - } - } - - void destroy_shared_allocation() {} -}; - template struct ViewValueFunctorSequentialHostInit { using ExecSpace = typename DeviceType::execution_space; @@ -358,6 +243,63 @@ struct ViewValueFunctorSequentialHostInit { } }; +template +Kokkos::Impl::SharedAllocationRecord* make_shared_allocation_record( + const size_t& required_span_size, std::string_view label, + const MemorySpace& memory_space, + const std::optional exec_space, + std::bool_constant, std::bool_constant) { + static_assert(SpaceAccessibility::accessible); + + // Use this for constructing and destroying the view + using device_type = Kokkos::Device; + using functor_type = std::conditional_t< + SequentialInit, + ViewValueFunctorSequentialHostInit, + ViewValueFunctor>; + using record_type = + Kokkos::Impl::SharedAllocationRecord; + + /* Force alignment of allocations on on 8 byte boundaries even for + * element types smaller than 8 bytes */ + static constexpr std::size_t align_mask = 0x7; + + // Calculate the total size of the memory, in bytes, and make sure it is + // byte-aligned + const std::size_t alloc_size = + (required_span_size * sizeof(ElementType) + align_mask) & ~align_mask; + + auto* record = + exec_space + ? record_type::allocate(*exec_space, memory_space, std::string{label}, + alloc_size) + : record_type::allocate(memory_space, std::string{label}, alloc_size); + + auto ptr = static_cast(record->data()); + + auto functor = + exec_space ? functor_type(*exec_space, ptr, required_span_size, + std::string{label}) + : functor_type(ptr, required_span_size, std::string{label}); + + // Only initialize if the allocation is non-zero. + // May be zero if one of the dimensions is zero. + if constexpr (Initialize) { + if (alloc_size) { + // Assume destruction is only required when construction is requested. + // The ViewValueFunctor has both value construction and destruction + // operators. + record->m_destroy = std::move(functor); + + // Construct values + record->m_destroy.construct_shared_allocation(); + } + } + + return record; +} + } // namespace Kokkos::Impl #endif // KOKKOS_VIEW_ALLOC_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp b/lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp similarity index 96% rename from lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp index 23d4c2524c..f77066b70f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Atomic_View.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewAtomic.hpp @@ -13,8 +13,8 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //@HEADER -#ifndef KOKKOS_ATOMIC_VIEW_HPP -#define KOKKOS_ATOMIC_VIEW_HPP +#ifndef KOKKOS_VIEWATOMIC_HPP +#define KOKKOS_VIEWATOMIC_HPP #include #include @@ -44,10 +44,10 @@ class AtomicDataElement { } KOKKOS_INLINE_FUNCTION - void inc() const { Kokkos::atomic_increment(ptr); } + void inc() const { Kokkos::atomic_inc(ptr); } KOKKOS_INLINE_FUNCTION - void dec() const { Kokkos::atomic_decrement(ptr); } + void dec() const { Kokkos::atomic_dec(ptr); } KOKKOS_INLINE_FUNCTION const_value_type operator++() const { @@ -215,7 +215,7 @@ class AtomicViewDataHandle { } KOKKOS_INLINE_FUNCTION - operator typename ViewTraits::value_type*() const { return ptr; } + operator typename ViewTraits::value_type *() const { return ptr; } }; } // namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp similarity index 84% rename from lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp index 379180ae64..f080474717 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewCtor.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewCtor.hpp @@ -72,8 +72,8 @@ struct ViewCtorProp {}; */ template struct ViewCtorProp> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = CommonViewAllocProp; @@ -92,8 +92,8 @@ struct ViewCtorProp || std::is_same_v || std::is_same_v>, P> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = P; @@ -106,14 +106,14 @@ struct ViewCtorProp || /* Map input label type to std::string */ template struct ViewCtorProp::value>, Label> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = std::string; ViewCtorProp(const type &arg) : value(arg) {} - ViewCtorProp(type &&arg) : value(arg) {} + ViewCtorProp(type &&arg) : value(std::move(arg)) {} type value; }; @@ -122,8 +122,8 @@ template struct ViewCtorProp::value || Kokkos::is_execution_space::value>, Space> { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = Space; @@ -135,8 +135,8 @@ struct ViewCtorProp::value || template struct ViewCtorProp { - ViewCtorProp() = default; - ViewCtorProp(const ViewCtorProp &) = default; + ViewCtorProp() = default; + ViewCtorProp(const ViewCtorProp &) = default; ViewCtorProp &operator=(const ViewCtorProp &) = default; using type = T *; @@ -213,14 +213,19 @@ struct ViewCtorProp : public ViewCtorProp... { using execution_space = typename var_execution_space::type; using pointer_type = typename var_pointer::type; - /* Copy from a matching argument list. - * Requires std::is_same< P , ViewCtorProp< void , Args >::value ... - */ - template - inline ViewCtorProp(Args const &... args) : ViewCtorProp(args)... {} + // Construct from a matching argument list. + // + // Note that if P is empty, this constructor is the default constructor. + // On the other hand, if P is not empty, the constraint implies that + // there is no default constructor. + template , Args &&>...>>> + ViewCtorProp(Args &&...args) + : ViewCtorProp(std::forward(args))... {} template - KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &... args) + KOKKOS_FUNCTION ViewCtorProp(pointer_type arg0, Args const &...args) : ViewCtorProp(arg0), ViewCtorProp::type>(args)... {} @@ -252,7 +257,7 @@ auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop) { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, [[maybe_unused]] const Property &property, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -302,7 +307,7 @@ template struct WithPropertiesIfUnset, Property, Properties...> { static constexpr auto apply_prop(const ViewCtorProp &view_ctor_prop, const Property &prop, - const Properties &... properties) { + const Properties &...properties) { if constexpr ((is_execution_space::value && !ViewCtorProp::has_execution_space) || (is_memory_space::value && @@ -328,7 +333,7 @@ struct WithPropertiesIfUnset, Property, Properties...> { template auto with_properties_if_unset(const ViewCtorProp &view_ctor_prop, - const Properties &... properties) { + const Properties &...properties) { return WithPropertiesIfUnset, Properties...>::apply_prop( view_ctor_prop, properties...); } @@ -437,6 +442,48 @@ using ViewAllocateWithoutInitializing = Impl::ViewCtorProp; +inline constexpr Kokkos::Impl::SequentialHostInit_t SequentialHostInit{}; + +inline constexpr Kokkos::Impl::WithoutInitializing_t WithoutInitializing{}; + +inline constexpr Kokkos::Impl::AllowPadding_t AllowPadding{}; + +/** \brief Create View allocation parameter bundle from argument list. + * + * Valid argument list members are: + * 1) label as a "string" or std::string + * 2) memory space instance of the View::memory_space type + * 3) execution space instance compatible with the View::memory_space + * 4) Kokkos::WithoutInitializing to bypass initialization + * 4) Kokkos::AllowPadding to allow allocation to pad dimensions for memory + * alignment + */ +template +auto view_alloc(Args &&...args) { + using return_type = Impl::ViewCtorProp>::type...>; + + static_assert(!return_type::has_pointer, + "Cannot give pointer-to-memory for view allocation"); + + return return_type(std::forward(args)...); +} + +template +KOKKOS_INLINE_FUNCTION + Impl::ViewCtorProp::type...> + view_wrap(Args const &...args) { + using return_type = + Impl::ViewCtorProp::type...>; + + static_assert(!return_type::has_memory_space && + !return_type::has_execution_space && + !return_type::has_label && return_type::has_pointer, + "Must only give pointer-to-memory for view wrapping"); + + return return_type(args...); +} + } /* namespace Kokkos */ //---------------------------------------------------------------------------- diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp similarity index 96% rename from lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp index 04c0c9aeed..37b6e2802f 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewDataAnalysis.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewDataAnalysis.hpp @@ -60,8 +60,8 @@ struct rank_dynamic { static constexpr size_t ArgN##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ static constexpr size_t N##R = (V != KOKKOS_INVALID_INDEX ? V : 1); \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t) {} \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ }; \ template \ @@ -72,8 +72,8 @@ struct rank_dynamic { struct ViewDimension##R<0u, RD> { \ static constexpr size_t ArgN##R = 0; \ std::conditional_t<(RD < 3), size_t, unsigned> N##R; \ - ViewDimension##R() = default; \ - ViewDimension##R(const ViewDimension##R&) = default; \ + ViewDimension##R() = default; \ + ViewDimension##R(const ViewDimension##R&) = default; \ ViewDimension##R& operator=(const ViewDimension##R&) = default; \ KOKKOS_INLINE_FUNCTION explicit ViewDimension##R(size_t V) : N##R(V) {} \ }; \ @@ -149,8 +149,8 @@ struct KOKKOS_IMPL_ENFORCE_EMPTY_BASE_OPTIMIZATION ViewDimension static constexpr unsigned rank = sizeof...(Vals); static constexpr unsigned rank_dynamic = Impl::rank_dynamic::value; - ViewDimension() = default; - ViewDimension(const ViewDimension&) = default; + ViewDimension() = default; + ViewDimension(const ViewDimension&) = default; ViewDimension& operator=(const ViewDimension&) = default; KOKKOS_INLINE_FUNCTION @@ -370,8 +370,7 @@ struct ViewDataAnalysis { // ValueType is opportunity for partial specialization. // Must match array analysis when this default template is used. static_assert( - std::is_same::value); + std::is_same_v); public: using specialize = void; // No specialization diff --git a/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp new file mode 100644 index 0000000000..fd406d58cc --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_ViewLegacy.hpp @@ -0,0 +1,1604 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWLEGACY_HPP +#define KOKKOS_VIEWLEGACY_HPP + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#include +#endif +#include + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +/** \class View + * \brief View to an array of data. + * + * A View represents an array of one or more dimensions. + * For details, please refer to Kokkos' tutorial materials. + * + * \section Kokkos_View_TemplateParameters Template parameters + * + * This class has both required and optional template parameters. The + * \c DataType parameter must always be provided, and must always be + * first. The parameters \c Arg1Type, \c Arg2Type, and \c Arg3Type are + * placeholders for different template parameters. The default value + * of the fifth template parameter \c Specialize suffices for most use + * cases. When explaining the template parameters, we won't refer to + * \c Arg1Type, \c Arg2Type, and \c Arg3Type; instead, we will refer + * to the valid categories of template parameters, in whatever order + * they may occur. + * + * Valid ways in which template arguments may be specified: + * - View< DataType > + * - View< DataType , Layout > + * - View< DataType , Layout , Space > + * - View< DataType , Layout , Space , MemoryTraits > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + * + * \tparam DataType (required) This indicates both the type of each + * entry of the array, and the combination of compile-time and + * run-time array dimension(s). For example, double* + * indicates a one-dimensional array of \c double with run-time + * dimension, and int*[3] a two-dimensional array of \c int + * with run-time first dimension and compile-time second dimension + * (of 3). In general, the run-time dimensions (if any) must go + * first, followed by zero or more compile-time dimensions. For + * more examples, please refer to the tutorial materials. + * + * \tparam Space (required) The memory space. + * + * \tparam Layout (optional) The array's layout in memory. For + * example, LayoutLeft indicates a column-major (Fortran style) + * layout, and LayoutRight a row-major (C style) layout. If not + * specified, this defaults to the preferred layout for the + * Space. + * + * \tparam MemoryTraits (optional) Assertion of the user's intended + * access behavior. For example, RandomAccess indicates read-only + * access with limited spatial locality, and Unmanaged lets users + * wrap externally allocated memory in a View without automatic + * deallocation. + * + * \section Kokkos_View_MT MemoryTraits discussion + * + * \subsection Kokkos_View_MT_Interp MemoryTraits interpretation depends on + * Space + * + * Some \c MemoryTraits options may have different interpretations for + * different \c Space types. For example, with the Cuda device, + * \c RandomAccess tells Kokkos to fetch the data through the texture + * cache, whereas the non-GPU devices have no such hardware construct. + * + * \subsection Kokkos_View_MT_PrefUse Preferred use of MemoryTraits + * + * Users should defer applying the optional \c MemoryTraits parameter + * until the point at which they actually plan to rely on it in a + * computational kernel. This minimizes the number of template + * parameters exposed in their code, which reduces the cost of + * compilation. Users may always assign a View without specified + * \c MemoryTraits to a compatible View with that specification. + * For example: + * \code + * // Pass in the simplest types of View possible. + * void + * doSomething (View out, + * View in) + * { + * // Assign the "generic" View in to a RandomAccess View in_rr. + * // Note that RandomAccess View objects must have const data. + * View in_rr = in; + * // ... do something with in_rr and out ... + * } + * \endcode + */ + +} // namespace Kokkos + +namespace Kokkos { + +template +struct is_always_assignable_impl; + +template +struct is_always_assignable_impl, + Kokkos::View> { + using mapping_type = Kokkos::Impl::ViewMapping< + typename Kokkos::View::traits, + typename Kokkos::View::traits, + typename Kokkos::View::traits::specialize>; + + constexpr static bool value = + mapping_type::is_assignable && + static_cast(Kokkos::View::rank_dynamic) >= + static_cast(Kokkos::View::rank_dynamic); +}; + +template +using is_always_assignable = is_always_assignable_impl< + std::remove_reference_t, + std::remove_const_t>>; + +template +inline constexpr bool is_always_assignable_v = + is_always_assignable::value; + +template +constexpr bool is_assignable(const Kokkos::View& dst, + const Kokkos::View& src) { + using DstTraits = typename Kokkos::View::traits; + using SrcTraits = typename Kokkos::View::traits; + using mapping_type = + Kokkos::Impl::ViewMapping; + + return is_always_assignable_v, + Kokkos::View> || + (mapping_type::is_assignable && + ((DstTraits::dimension::rank_dynamic >= 1) || + (dst.static_extent(0) == src.extent(0))) && + ((DstTraits::dimension::rank_dynamic >= 2) || + (dst.static_extent(1) == src.extent(1))) && + ((DstTraits::dimension::rank_dynamic >= 3) || + (dst.static_extent(2) == src.extent(2))) && + ((DstTraits::dimension::rank_dynamic >= 4) || + (dst.static_extent(3) == src.extent(3))) && + ((DstTraits::dimension::rank_dynamic >= 5) || + (dst.static_extent(4) == src.extent(4))) && + ((DstTraits::dimension::rank_dynamic >= 6) || + (dst.static_extent(5) == src.extent(5))) && + ((DstTraits::dimension::rank_dynamic >= 7) || + (dst.static_extent(6) == src.extent(6))) && + ((DstTraits::dimension::rank_dynamic >= 8) || + (dst.static_extent(7) == src.extent(7)))); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +class View; + +template +struct is_view : public std::false_type {}; + +template +struct is_view> : public std::true_type {}; + +template +struct is_view> : public std::true_type {}; + +template +inline constexpr bool is_view_v = is_view::value; + +template +class View : public ViewTraits { + private: + template + friend class View; + template + friend class Kokkos::Impl::ViewMapping; + + using view_tracker_type = Kokkos::Impl::ViewTracker; + + public: + using traits = ViewTraits; + + private: + using map_type = + Kokkos::Impl::ViewMapping; + template + friend struct Kokkos::Impl::ViewTracker; + using hooks_policy = typename traits::hooks_policy; + + view_tracker_type m_track; + map_type m_map; + + public: + //---------------------------------------- + /** \brief Compatible view of array of scalar types */ + using array_type = + View; + + /** \brief Compatible view of const data type */ + using const_type = + View; + + /** \brief Compatible view of non-const data type */ + using non_const_type = + View; + + /** \brief Compatible host mirror view */ + using host_mirror_type = + View, + typename traits::hooks_policy>; + + /** \brief Compatible host mirror view */ + using HostMirror = host_mirror_type; + + /** \brief Unified types */ + using uniform_type = typename Impl::ViewUniformType::type; + using uniform_const_type = + typename Impl::ViewUniformType::const_type; + using uniform_runtime_type = + typename Impl::ViewUniformType::runtime_type; + using uniform_runtime_const_type = + typename Impl::ViewUniformType::runtime_const_type; + using uniform_nomemspace_type = + typename Impl::ViewUniformType::nomemspace_type; + using uniform_const_nomemspace_type = + typename Impl::ViewUniformType::const_nomemspace_type; + using uniform_runtime_nomemspace_type = + typename Impl::ViewUniformType::runtime_nomemspace_type; + using uniform_runtime_const_nomemspace_type = + typename Impl::ViewUniformType::runtime_const_nomemspace_type; + + using reference_type = typename map_type::reference_type; + using pointer_type = typename map_type::pointer_type; + + // Typedefs from mdspan + // using extents_type -> not applicable + // Defining layout_type here made MSVC+CUDA fail + // using layout_type = typename traits::array_layout; + // using accessor_type -> not applicable + // using mapping_type -> not applicable + using element_type = typename traits::value_type; + // using value_type -> conflicts with traits::value_type + using index_type = typename traits::memory_space::size_type; + // using size_type -> already from traits::size_type; where it is + // memory_space::size_type + using rank_type = size_t; + using data_handle_type = pointer_type; + using reference = reference_type; + + //---------------------------------------- + // Domain rank and extents + + static constexpr Impl::integral_constant + rank = {}; + static constexpr Impl::integral_constant + rank_dynamic = {}; +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 + enum {Rank KOKKOS_DEPRECATED_WITH_COMMENT("Use rank instead.") = + map_type::Rank}; +#endif + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + extent(const iType& r) const noexcept { + return m_map.extent(r); + } + + static KOKKOS_INLINE_FUNCTION constexpr size_t static_extent( + const unsigned r) noexcept { + return map_type::static_extent(r); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + int> + extent_int(const iType& r) const noexcept { + return static_cast(m_map.extent(r)); + } + + KOKKOS_INLINE_FUNCTION constexpr typename traits::array_layout layout() + const { + return m_map.layout(); + } + + //---------------------------------------- + /* Deprecate all 'dimension' functions in favor of + * ISO/C++ vocabulary 'extent'. + */ + + KOKKOS_INLINE_FUNCTION constexpr size_t size() const { + return m_map.dimension_0() * m_map.dimension_1() * m_map.dimension_2() * + m_map.dimension_3() * m_map.dimension_4() * m_map.dimension_5() * + m_map.dimension_6() * m_map.dimension_7(); + } + + KOKKOS_INLINE_FUNCTION constexpr size_t stride_0() const { + return m_map.stride_0(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_1() const { + return m_map.stride_1(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_2() const { + return m_map.stride_2(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_3() const { + return m_map.stride_3(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_4() const { + return m_map.stride_4(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_5() const { + return m_map.stride_5(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_6() const { + return m_map.stride_6(); + } + KOKKOS_INLINE_FUNCTION constexpr size_t stride_7() const { + return m_map.stride_7(); + } + + template + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t, + size_t> + stride(iType r) const { + return ( + r == 0 + ? m_map.stride_0() + : (r == 1 + ? m_map.stride_1() + : (r == 2 + ? m_map.stride_2() + : (r == 3 + ? m_map.stride_3() + : (r == 4 + ? m_map.stride_4() + : (r == 5 + ? m_map.stride_5() + : (r == 6 + ? m_map.stride_6() + : m_map.stride_7()))))))); + } + + template + KOKKOS_INLINE_FUNCTION void stride(iType* const s) const { + m_map.stride(s); + } + + //---------------------------------------- + // Range span is the span which contains all members. + + enum { + reference_type_is_lvalue_reference = + std::is_lvalue_reference_v + }; + + KOKKOS_INLINE_FUNCTION constexpr size_t span() const { return m_map.span(); } + KOKKOS_INLINE_FUNCTION bool span_is_contiguous() const { + return m_map.span_is_contiguous(); + } + KOKKOS_INLINE_FUNCTION constexpr bool is_allocated() const { + return m_map.data() != nullptr; + } + KOKKOS_INLINE_FUNCTION constexpr pointer_type data() const { + return m_map.data(); + } + + //---------------------------------------- + // Allow specializations to query their specialized map + + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::ViewMapping& + impl_map() const { + return m_map; + } + KOKKOS_INLINE_FUNCTION + const Kokkos::Impl::SharedAllocationTracker& impl_track() const { + return m_track.m_tracker; + } + //---------------------------------------- + + private: + static constexpr bool is_layout_left = + std::is_same_v; + + static constexpr bool is_layout_right = + std::is_same_v; + + static constexpr bool is_layout_stride = + std::is_same_v; + + static constexpr bool is_default_map = + std::is_void_v && + (is_layout_left || is_layout_right || is_layout_stride); + +#if defined(KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK) + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); \ + Kokkos::Impl::view_verify_operator_bounds( \ + __VA_ARGS__); + +#else + +#define KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(...) \ + Kokkos::Impl::runtime_check_memory_access_violation< \ + typename traits::memory_space>( \ + "Kokkos::View ERROR: attempt to access inaccessible memory space", \ + __VA_ARGS__); + +#endif + + template + static KOKKOS_FUNCTION void check_access_member_function_valid_args(Is...) { + static_assert(rank <= sizeof...(Is)); + static_assert(sizeof...(Is) <= 8); + static_assert(Kokkos::Impl::are_integral::value); + } + + template + static KOKKOS_FUNCTION void check_operator_parens_valid_args(Is...) { + static_assert(rank == sizeof...(Is)); + static_assert(Kokkos::Impl::are_integral::value); + } + + public: + //------------------------------ + // Rank 1 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + operator()(I0 i0) const { + check_operator_parens_valid_args(i0); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 1 operator[] + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + ((1 == rank) && Kokkos::Impl::are_integral::value && !is_default_map), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && !is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<((1 == rank) && Kokkos::Impl::are_integral::value && + is_default_map && is_layout_stride), + reference_type> + operator[](I0 i0) const { + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 default map operator() + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 == rank) && is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + operator()(I0 i0, I1 i1) const { + check_operator_parens_valid_args(i0, i1); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + // Rank 0 -> 8 operator() except for rank-1 and rank-2 with default map which + // have "inlined" versions above + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && // + (2 != rank) && (1 != rank) && (0 != rank) && is_default_map), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.m_impl_handle[m_map.m_impl_offset(indices...)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && // + ((0 == rank) || !is_default_map)), + reference_type> + operator()(Is... indices) const { + check_operator_parens_valid_args(indices...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, indices...) + return m_map.reference(indices...); + } + + //------------------------------ + // Rank 0 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (0 == rank)), reference_type> + access(Is... extra) const { + check_access_member_function_valid_args(extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, extra...) + return m_map.reference(); + } + + //------------------------------ + // Rank 1 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && !is_default_map), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.reference(i0); + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && !is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[i0]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (1 == rank) && is_default_map && is_layout_stride), + reference_type> + access(I0 i0, Is... extra) const { + check_access_member_function_valid_args(i0, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset.m_stride.S0 * i0]; + } + + //------------------------------ + // Rank 2 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (2 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + return m_map.reference(i0, i1); + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (2 == rank) && + is_default_map && + (is_layout_left || is_layout_right || is_layout_stride)), + reference_type> + access(I0 i0, I1 i1, Is... extra) const { + check_access_member_function_valid_args(i0, i1, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, extra...) + if constexpr (is_layout_left) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_dim.N0 * i1]; + else + return m_map.m_impl_handle[i0 + m_map.m_impl_offset.m_stride * i1]; + } else if constexpr (is_layout_right) { + if constexpr (rank_dynamic == 0) + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_dim.N1 * i0]; + else + return m_map.m_impl_handle[i1 + m_map.m_impl_offset.m_stride * i0]; + } else { + static_assert(is_layout_stride); + return m_map.m_impl_handle[i0 * m_map.m_impl_offset.m_stride.S0 + + i1 * m_map.m_impl_offset.m_stride.S1]; + } +#if defined KOKKOS_COMPILER_INTEL + __builtin_unreachable(); +#endif + } + + //------------------------------ + // Rank 3 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (3 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, extra...) + return m_map.reference(i0, i1, i2); + } + + //------------------------------ + // Rank 4 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && (4 == rank) && + !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, extra...) + return m_map.reference(i0, i1, i2, i3); + } + + //------------------------------ + // Rank 5 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (5 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, + extra...) + return m_map.reference(i0, i1, i2, i3, i4); + } + + //------------------------------ + // Rank 6 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (6 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5); + } + + //------------------------------ + // Rank 7 + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (Kokkos::Impl::always_true::value && + (7 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6); + } + + //------------------------------ + // Rank 8 + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map + .m_impl_handle[m_map.m_impl_offset(i0, i1, i2, i3, i4, i5, i6, i7)]; + } + + template + KOKKOS_FORCEINLINE_FUNCTION + std::enable_if_t<(Kokkos::Impl::always_true::value && + (8 == rank) && !is_default_map), + reference_type> + access(I0 i0, I1 i1, I2 i2, I3 i3, I4 i4, I5 i5, I6 i6, I7 i7, + Is... extra) const { + check_access_member_function_valid_args(i0, i1, i2, i3, i4, i5, i6, i7, + extra...); + KOKKOS_IMPL_VIEW_OPERATOR_VERIFY(m_track, m_map, i0, i1, i2, i3, i4, i5, i6, + i7, extra...) + return m_map.reference(i0, i1, i2, i3, i4, i5, i6, i7); + } + +#undef KOKKOS_IMPL_VIEW_OPERATOR_VERIFY + + //---------------------------------------- + // Standard destructor, constructors, and assignment operators + + KOKKOS_DEFAULTED_FUNCTION + ~View() = default; + + KOKKOS_DEFAULTED_FUNCTION + View() = default; + + KOKKOS_FUNCTION + View(const View& other) : m_track(other.m_track), m_map(other.m_map) { + KOKKOS_IF_ON_HOST((hooks_policy::copy_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View(View&& other) + : m_track{std::move(other.m_track)}, m_map{std::move(other.m_map)} { + KOKKOS_IF_ON_HOST((hooks_policy::move_construct(*this, other);)) + } + + KOKKOS_FUNCTION + View& operator=(const View& other) { + m_map = other.m_map; + m_track = other.m_track; + + KOKKOS_IF_ON_HOST((hooks_policy::copy_assign(*this, other);)) + + return *this; + } + + KOKKOS_FUNCTION + View& operator=(View&& other) { + m_map = std::move(other.m_map); + m_track = std::move(other.m_track); + + KOKKOS_IF_ON_HOST((hooks_policy::move_assign(*this, other);)) + + return *this; + } + + //---------------------------------------- + // Compatible view copy constructor and assignment + // may assign unmanaged from managed. + + template + KOKKOS_INLINE_FUNCTION View( + const View& rhs, + std::enable_if_t::traits, + typename traits::specialize>::is_assignable_data_type>* = nullptr) + : m_track(rhs), m_map() { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + } + + template + KOKKOS_INLINE_FUNCTION std::enable_if_t< + Kokkos::Impl::ViewMapping< + traits, typename View::traits, + typename traits::specialize>::is_assignable_data_type, + View>& + operator=(const View& rhs) { + using SrcTraits = typename View::traits; + using Mapping = Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, "Incompatible View copy assignment"); + Mapping::assign(m_map, rhs.m_map, rhs.m_track.m_tracker); + m_track.assign(rhs); + return *this; + } + + //---------------------------------------- + // Compatible subview constructor + // may assign unmanaged from managed. + + template + KOKKOS_INLINE_FUNCTION View(const View& src_view, const Arg0 arg0, + Args... args) + : m_track(src_view), m_map() { + using SrcType = View; + + using Mapping = Kokkos::Impl::ViewMapping; + + using DstType = typename Mapping::type; + + static_assert( + Kokkos::Impl::ViewMapping::is_assignable, + "Subview construction requires compatible view and subview arguments"); + + Mapping::assign(m_map, src_view.m_map, arg0, args...); + } + + //---------------------------------------- + // Allocation tracking properties + + KOKKOS_INLINE_FUNCTION + int use_count() const { return m_track.m_tracker.use_count(); } + + inline const std::string label() const { + return m_track.m_tracker + .template get_label(); + } + + public: + //---------------------------------------- + // Allocation according to allocation properties and array layout + + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track(), m_map() { + // Copy the input allocation properties with possibly defaulted properties + // We need to split it in two to avoid MSVC compiler errors + auto prop_copy_tmp = + Impl::with_properties_if_unset(arg_prop, std::string{}); + auto prop_copy = Impl::with_properties_if_unset( + prop_copy_tmp, typename traits::device_type::memory_space{}, + typename traits::device_type::execution_space{}); + using alloc_prop = decltype(prop_copy); + + static_assert(traits::is_managed, + "View allocation constructor requires managed memory"); + + if (alloc_prop::initialize && + !alloc_prop::execution_space::impl_is_initialized()) { + // If initializing view data then + // the execution space must be initialized. + Kokkos::Impl::throw_runtime_exception( + "Constructing View and initializing data with uninitialized " + "execution space"); + } + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + const std::string& alloc_name = + Impl::get_property(prop_copy); + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, alloc_name.c_str()); + } +#endif + + Kokkos::Impl::SharedAllocationRecord<>* record = m_map.allocate_shared( + prop_copy, arg_layout, Impl::ViewCtorProp::has_execution_space); + + // Setup and initialization complete, start tracking + m_track.m_tracker.assign_allocated_record_to_uninitialized(record); + } + + KOKKOS_INLINE_FUNCTION + void assign_data(pointer_type arg_data) { + m_track.m_tracker.clear(); + m_map.assign_data(arg_data); + } + + // Wrap memory according to properties and array layout + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, + typename traits::array_layout> const& arg_layout) + : m_track() // No memory tracking + , + m_map(arg_prop, arg_layout) { + static_assert( + std::is_same::pointer_type>::value, + "Constructing View to wrap user memory must supply matching pointer " + "type"); + +#ifdef KOKKOS_ENABLE_DEBUG_BOUNDS_CHECK + if constexpr (std::is_same_v || + std::is_same_v || + std::is_same_v) { + size_t i0 = arg_layout.dimension[0]; + size_t i1 = arg_layout.dimension[1]; + size_t i2 = arg_layout.dimension[2]; + size_t i3 = arg_layout.dimension[3]; + size_t i4 = arg_layout.dimension[4]; + size_t i5 = arg_layout.dimension[5]; + size_t i6 = arg_layout.dimension[6]; + size_t i7 = arg_layout.dimension[7]; + + Impl::runtime_check_rank( + *this, std::is_same::value, i0, i1, + i2, i3, i4, i5, i6, i7, "UNMANAGED"); + } +#endif + } + + // Simple dimension-only layout + template + explicit inline View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + template + explicit KOKKOS_INLINE_FUNCTION View( + const Impl::ViewCtorProp& arg_prop, + std::enable_if_t::has_pointer, size_t> const + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(arg_prop, + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Allocate with label and layout + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, + typename traits::array_layout> const& arg_layout) + : View(Impl::ViewCtorProp(arg_label), arg_layout) {} + + // Allocate label and layout, must disambiguate from subview constructor. + template + explicit inline View( + const Label& arg_label, + std::enable_if_t::value, const size_t> + arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_label), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + // Construct view from ViewTracker and map + // This should be the preferred method because future extensions may need to + // use the ViewTracker class. + template + KOKKOS_INLINE_FUNCTION View( + const view_tracker_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track.m_tracker); + } + + // Construct View from internal shared allocation tracker object and map + // This is here for backwards compatibility for classes that derive from + // Kokkos::View + template + KOKKOS_INLINE_FUNCTION View( + const typename view_tracker_type::track_type& track, + const Kokkos::Impl::ViewMapping& map) + : m_track(track), m_map() { + using Mapping = + Kokkos::Impl::ViewMapping; + static_assert(Mapping::is_assignable, + "Incompatible View copy construction"); + Mapping::assign(m_map, map, track); + } + + //---------------------------------------- + // Memory span required to wrap these dimensions. + static constexpr size_t required_allocation_size( + typename traits::array_layout const& layout) { + return map_type::memory_span(layout); + } + + static constexpr size_t required_allocation_size( + const size_t arg_N0 = 0, const size_t arg_N1 = 0, const size_t arg_N2 = 0, + const size_t arg_N3 = 0, const size_t arg_N4 = 0, const size_t arg_N5 = 0, + const size_t arg_N6 = 0, const size_t arg_N7 = 0) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + return map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp(arg_ptr), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + explicit KOKKOS_INLINE_FUNCTION View( + pointer_type arg_ptr, const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(arg_ptr), arg_layout) {} + + //---------------------------------------- + // Shared scratch memory constructor + + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(const size_t arg_N0 = KOKKOS_INVALID_INDEX, + const size_t arg_N1 = KOKKOS_INVALID_INDEX, + const size_t arg_N2 = KOKKOS_INVALID_INDEX, + const size_t arg_N3 = KOKKOS_INVALID_INDEX, + const size_t arg_N4 = KOKKOS_INVALID_INDEX, + const size_t arg_N5 = KOKKOS_INVALID_INDEX, + const size_t arg_N6 = KOKKOS_INVALID_INDEX, + const size_t arg_N7 = KOKKOS_INVALID_INDEX) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + const size_t num_passed_args = Impl::count_valid_integers( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7); + + if (std::is_void_v && + num_passed_args != rank_dynamic) { + Kokkos::abort( + "Kokkos::View::shmem_size() rank_dynamic != number of arguments.\n"); + } + + return View::shmem_size(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, arg_N7)); + } + + private: + // Want to be able to align to minimum scratch alignment or sizeof or alignof + // elements + static constexpr size_t scratch_value_alignment = + max({sizeof(typename traits::value_type), + alignof(typename traits::value_type), + static_cast( + traits::execution_space::scratch_memory_space::ALIGN)}); + + public: + static KOKKOS_INLINE_FUNCTION size_t + shmem_size(typename traits::array_layout const& arg_layout) { + return map_type::memory_span(arg_layout) + scratch_value_alignment; + } + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const typename traits::array_layout& arg_layout) + : View(Impl::ViewCtorProp(reinterpret_cast( + arg_space.get_shmem_aligned(map_type::memory_span(arg_layout), + scratch_value_alignment))), + arg_layout) {} + + explicit KOKKOS_INLINE_FUNCTION View( + const typename traits::execution_space::scratch_memory_space& arg_space, + const size_t arg_N0 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N1 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N2 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N3 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N4 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N5 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N6 = KOKKOS_IMPL_CTOR_DEFAULT_ARG, + const size_t arg_N7 = KOKKOS_IMPL_CTOR_DEFAULT_ARG) + : View(Impl::ViewCtorProp( + reinterpret_cast(arg_space.get_shmem_aligned( + map_type::memory_span(typename traits::array_layout( + arg_N0, arg_N1, arg_N2, arg_N3, arg_N4, arg_N5, arg_N6, + arg_N7)), + scratch_value_alignment))), + typename traits::array_layout(arg_N0, arg_N1, arg_N2, arg_N3, + arg_N4, arg_N5, arg_N6, arg_N7)) { + static_assert(traits::array_layout::is_extent_constructible, + "Layout is not constructible from extent arguments. Use " + "overload taking a layout object instead."); + } + + //---------------------------------------- + // MDSpan converting constructors +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN + template ::mdspan_type> + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(traits::is_managed) +#endif + View(const typename Impl::MDSpanViewTraits::mdspan_type& mds, + std::enable_if_t< + !std::is_same_v>* = + nullptr) + : View(mds.data_handle(), + Impl::array_layout_from_mapping< + typename traits::array_layout, + typename Impl::MDSpanViewTraits::mdspan_type>( + mds.mapping())) { + } + + template + KOKKOS_INLINE_FUNCTION +#ifndef KOKKOS_ENABLE_CXX17 + explicit(!std::is_convertible_v< + Kokkos::mdspan, + typename Impl::MDSpanViewTraits::mdspan_type>) +#endif + View(const Kokkos::mdspan& mds) + : View(typename Impl::MDSpanViewTraits::mdspan_type(mds)) { + } + + //---------------------------------------- + // Conversion to MDSpan + template ::mdspan_type, + typename = std::enable_if_t, + std::false_type, + std::is_assignable, + ImplNaturalMDSpanType>>::value>> + KOKKOS_INLINE_FUNCTION constexpr operator mdspan< + OtherElementType, OtherExtents, OtherLayoutPolicy, OtherAccessor>() { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + return mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map)}; + } + + template >, + typename = std::enable_if_t>> + KOKKOS_INLINE_FUNCTION constexpr auto to_mdspan( + const OtherAccessorType& other_accessor = + typename Impl::MDSpanViewTraits::accessor_type()) { + using mdspan_type = typename Impl::MDSpanViewTraits::mdspan_type; + using ret_mdspan_type = + mdspan; + return ret_mdspan_type{data(), + Impl::mapping_from_view_mapping(m_map), + other_accessor}; + } +#endif // KOKKOS_ENABLE_IMPL_MDSPAN +}; + +template +KOKKOS_INLINE_FUNCTION constexpr unsigned rank(const View&) { + return View::rank(); +} + +namespace Impl { + +template +struct RankDataType { + using type = typename RankDataType::type*; +}; + +template +struct RankDataType { + using type = ValueType; +}; + +template +KOKKOS_FUNCTION std::enable_if_t< + N == View::rank() && + std::is_same_v::specialize, void>, + View> +as_view_of_rank_n(View v) { + return v; +} + +// Placeholder implementation to compile generic code for DynRankView; should +// never be called +template +KOKKOS_FUNCTION std::enable_if_t< + N != View::rank() && + std::is_same_v::specialize, void>, + View::value_type, N>::type, + Args...>> +as_view_of_rank_n(View) { + Kokkos::abort("Trying to get at a View of the wrong rank"); + return {}; +} + +template +void apply_to_view_of_static_rank(Function&& f, View a) { + f(a); +} + +} // namespace Impl + +template +KOKKOS_INLINE_FUNCTION auto subview(const View& src, Args... args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +template +KOKKOS_DEPRECATED KOKKOS_INLINE_FUNCTION auto subview(const View& src, + Args... args) { + static_assert(View::rank == sizeof...(Args), + "subview requires one argument for each source View rank"); + static_assert(Kokkos::is_memory_traits::value); + + return typename Kokkos::Impl::ViewMapping< + void /* deduce subview type from source view traits */ + , + typename Impl::RemoveAlignedMemoryTrait::type, + Args...>::type(src, args...); +} +#endif + +template +using Subview = decltype(subview(std::declval(), std::declval()...)); + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +template +KOKKOS_INLINE_FUNCTION bool operator==(const View& lhs, + const View& rhs) { + // Same data, layout, dimensions + using lhs_traits = ViewTraits; + using rhs_traits = ViewTraits; + + return std::is_same_v && + std::is_same_v && + std::is_same_v && + View::rank() == View::rank() && + lhs.data() == rhs.data() && lhs.span() == rhs.span() && + lhs.extent(0) == rhs.extent(0) && lhs.extent(1) == rhs.extent(1) && + lhs.extent(2) == rhs.extent(2) && lhs.extent(3) == rhs.extent(3) && + lhs.extent(4) == rhs.extent(4) && lhs.extent(5) == rhs.extent(5) && + lhs.extent(6) == rhs.extent(6) && lhs.extent(7) == rhs.extent(7); +} + +template +KOKKOS_INLINE_FUNCTION bool operator!=(const View& lhs, + const View& rhs) { + return !(operator==(lhs, rhs)); +} + +} /* namespace Kokkos */ + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { +namespace Impl { + +template +struct CommonViewValueType; + +template +struct CommonViewValueType { + using value_type = std::common_type_t; +}; + +template +struct CommonViewAllocProp; + +template +struct CommonViewAllocProp { + using value_type = ValueType; + using scalar_array_type = ValueType; + + template + KOKKOS_INLINE_FUNCTION CommonViewAllocProp(const Views&...) {} +}; + +template +struct DeduceCommonViewAllocProp; + +// Base case must provide types for: +// 1. specialize 2. value_type 3. is_view 4. prop_type +template +struct DeduceCommonViewAllocProp { + using specialize = typename FirstView::traits::specialize; + + using value_type = typename FirstView::traits::value_type; + + enum : bool { is_view = is_view::value }; + + using prop_type = CommonViewAllocProp; +}; + +template +struct DeduceCommonViewAllocProp { + using NextTraits = DeduceCommonViewAllocProp; + + using first_specialize = typename FirstView::traits::specialize; + using first_value_type = typename FirstView::traits::value_type; + + enum : bool { first_is_view = is_view::value }; + + using next_specialize = typename NextTraits::specialize; + using next_value_type = typename NextTraits::value_type; + + enum : bool { next_is_view = NextTraits::is_view }; + + // common types + + // determine specialize type + // if first and next specialize differ, but are not the same specialize, error + // out + static_assert(!(!std::is_same_v && + !std::is_void_v && + !std::is_void_v), + "Kokkos DeduceCommonViewAllocProp ERROR: Only one non-void " + "specialize trait allowed"); + + // otherwise choose non-void specialize if either/both are non-void + using specialize = + std::conditional_t, + first_specialize, + std::conditional_t<(std::is_void_v && + !std::is_void_v), + next_specialize, first_specialize>>; + + using value_type = typename CommonViewValueType::value_type; + + enum : bool { is_view = (first_is_view && next_is_view) }; + + using prop_type = CommonViewAllocProp; +}; + +} // end namespace Impl + +template +using DeducedCommonPropsType = + typename Impl::DeduceCommonViewAllocProp::prop_type; + +// This function is required in certain scenarios where users customize +// Kokkos View internals. One example are dynamic length embedded ensemble +// types. The function is used to propagate necessary information +// (like the ensemble size) when creating new views. +// However, most of the time it is called with a single view. +// Furthermore, the propagated information is not just for view allocations. +// From what I can tell, the type of functionality provided by +// common_view_alloc_prop is the equivalent of propagating accessors in mdspan, +// a mechanism we will eventually use to replace this clunky approach here, when +// we are finally mdspan based. +// TODO: get rid of this when we have mdspan +template +KOKKOS_INLINE_FUNCTION DeducedCommonPropsType common_view_alloc_prop( + Views const&... views) { + return DeducedCommonPropsType(views...); +} + +} // namespace Kokkos + +#include +#include + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +#endif /* #ifndef KOKKOS_VIEWLEGACY_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp similarity index 90% rename from lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp index 10aaa63b7c..ecc19eaf5e 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewMapping.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewMapping.hpp @@ -28,61 +28,41 @@ #include #include #include -#include -#include -#include +#include +#include +#include +#include #include #include #include -#include +#include #include //---------------------------------------------------------------------------- //---------------------------------------------------------------------------- -namespace Kokkos { - -struct ALL_t { - KOKKOS_INLINE_FUNCTION - constexpr const ALL_t& operator()() const { return *this; } - - KOKKOS_INLINE_FUNCTION - constexpr bool operator==(const ALL_t&) const { return true; } -}; - -#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 -namespace Impl { -// TODO This alias declaration forces us to fully qualify ALL_t inside the -// Kokkos::Impl namespace to avoid deprecation warnings. Replace the -// fully-qualified name when we remove Kokkos::Impl::ALL_t. -using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = - Kokkos::ALL_t; -} // namespace Impl -#endif -} // namespace Kokkos - namespace Kokkos { namespace Impl { template struct is_integral_extent_type { - enum : bool { value = std::is_same::value ? 1 : 0 }; + enum : bool { value = std::is_same_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; // Assuming '2 == initializer_list::size()' template struct is_integral_extent_type> { - enum : bool { value = std::is_integral::value ? 1 : 0 }; + enum : bool { value = std::is_integral_v ? 1 : 0 }; }; template @@ -93,8 +73,7 @@ struct is_integral_extent { enum : bool { value = is_integral_extent_type::value }; - static_assert(value || std::is_integral::value || - std::is_void::value, + static_assert(value || std::is_integral_v || std::is_void_v, "subview argument must be either integral or integral extent"); }; @@ -112,16 +91,16 @@ struct SubviewLegalArgsCompileTime { enum { - value = (((CurrentArg == RankDest - 1) && - (Kokkos::Impl::is_integral_extent_type::value)) || - ((CurrentArg >= RankDest) && (std::is_integral::value)) || - ((CurrentArg < RankDest) && - (std::is_same::value)) || - ((CurrentArg == 0) && - (Kokkos::Impl::is_integral_extent_type::value))) && - (SubviewLegalArgsCompileTime::value) + value = + (((CurrentArg == RankDest - 1) && + (Kokkos::Impl::is_integral_extent_type::value)) || + ((CurrentArg >= RankDest) && (std::is_integral_v)) || + ((CurrentArg < RankDest) && (std::is_same_v)) || + ((CurrentArg == 0) && + (Kokkos::Impl::is_integral_extent_type::value))) && + (SubviewLegalArgsCompileTime::value) }; }; @@ -129,7 +108,7 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankDest - 1) || (std::is_integral::value)) && + value = ((CurrentArg == RankDest - 1) || (std::is_integral_v)) && (CurrentArg == RankSrc - 1) }; }; @@ -144,10 +123,9 @@ struct SubviewLegalArgsCompileTime::value)) || - ((CurrentArg < RankSrc - RankDest) && - (std::is_integral::value)) || + ((CurrentArg < RankSrc - RankDest) && (std::is_integral_v)) || ((CurrentArg >= RankSrc - RankDest) && - (std::is_same::value))) && + (std::is_same_v))) && (SubviewLegalArgsCompileTime::value) @@ -158,8 +136,8 @@ template struct SubviewLegalArgsCompileTime { enum { - value = ((CurrentArg == RankSrc - 1) && - (std::is_same::value)) + value = + ((CurrentArg == RankSrc - 1) && (std::is_same_v)) }; }; @@ -392,7 +370,7 @@ struct SubviewExtents { const int n = snprintf(buffer, LEN, "Kokkos::subview bounds error ("); error(buffer + n, LEN - n, 0, 0, dim, args...); - Kokkos::Impl::throw_runtime_exception(std::string(buffer));)) + Kokkos::abort(buffer);)) KOKKOS_IF_ON_DEVICE(((void)dim; Kokkos::abort("Kokkos::subview bounds error"); @@ -718,8 +696,8 @@ struct ViewOffset< return *this; } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -885,14 +863,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1071,8 +1052,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1086,7 +1067,11 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride(Padding::stride(arg_layout.dimension[0])) {} + m_stride( + arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG + ? arg_layout.stride + : Padding::stride(arg_layout.dimension[0])) { + } template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1407,8 +1392,8 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -1565,14 +1550,17 @@ struct ViewOffset< KOKKOS_INLINE_FUNCTION constexpr array_layout layout() const { constexpr auto r = dimension_type::rank; - return array_layout((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), - (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), - (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), - (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), - (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), - (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), - (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), - (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + array_layout l((r > 0 ? m_dim.N0 : KOKKOS_INVALID_INDEX), + (r > 1 ? m_dim.N1 : KOKKOS_INVALID_INDEX), + (r > 2 ? m_dim.N2 : KOKKOS_INVALID_INDEX), + (r > 3 ? m_dim.N3 : KOKKOS_INVALID_INDEX), + (r > 4 ? m_dim.N4 : KOKKOS_INVALID_INDEX), + (r > 5 ? m_dim.N5 : KOKKOS_INVALID_INDEX), + (r > 6 ? m_dim.N6 : KOKKOS_INVALID_INDEX), + (r > 7 ? m_dim.N7 : KOKKOS_INVALID_INDEX)); + // Without span_is_contiguous Sacado hidden dimensions get messed up + l.stride = span_is_contiguous() ? KOKKOS_IMPL_CTOR_DEFAULT_ARG : m_stride; + return l; } KOKKOS_INLINE_FUNCTION constexpr size_type dimension_0() const { @@ -1614,8 +1602,8 @@ struct ViewOffset< } KOKKOS_INLINE_FUNCTION constexpr bool span_is_contiguous() const { - return m_stride == m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * - m_dim.N2 * m_dim.N1; + return m_stride == static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * + m_dim.N4 * m_dim.N3 * m_dim.N2 * m_dim.N1; } /* Strides of dimensions */ @@ -1624,19 +1612,21 @@ struct ViewOffset< return m_dim.N7; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_5() const { - return m_dim.N7 * m_dim.N6; + return static_cast(m_dim.N7) * m_dim.N6; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_4() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_3() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_2() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_1() const { - return m_dim.N7 * m_dim.N6 * m_dim.N5 * m_dim.N4 * m_dim.N3 * m_dim.N2; + return static_cast(m_dim.N7) * m_dim.N6 * m_dim.N5 * m_dim.N4 * + m_dim.N3 * m_dim.N2; } KOKKOS_INLINE_FUNCTION constexpr size_type stride_0() const { return m_stride; @@ -1749,13 +1739,31 @@ struct ViewOffset< } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif /* Enable padding for trivial scalar types with non-zero trivial scalar size. */ + + private: + template + KOKKOS_FUNCTION constexpr size_type compute_stride( + const Kokkos::LayoutRight& arg_layout) { + if (arg_layout.stride != KOKKOS_IMPL_CTOR_DEFAULT_ARG) + return arg_layout.stride; + size_type value = m_dim.N1; + if constexpr (dimension_type::rank > 2) value *= m_dim.N2; + if constexpr (dimension_type::rank > 3) value *= m_dim.N3; + if constexpr (dimension_type::rank > 4) value *= m_dim.N4; + if constexpr (dimension_type::rank > 5) value *= m_dim.N5; + if constexpr (dimension_type::rank > 6) value *= m_dim.N6; + if constexpr (dimension_type::rank > 7) value *= m_dim.N7; + return Padding::stride(value); + } + + public: template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( std::integral_constant const&, @@ -1764,37 +1772,7 @@ struct ViewOffset< arg_layout.dimension[2], arg_layout.dimension[3], arg_layout.dimension[4], arg_layout.dimension[5], arg_layout.dimension[6], arg_layout.dimension[7]), - m_stride( - Padding:: - stride(/* 2 <= rank */ - m_dim.N1 * - (dimension_type::rank == 2 - ? size_t(1) - : m_dim.N2 * - (dimension_type::rank == 3 - ? size_t(1) - : m_dim.N3 * - (dimension_type::rank == 4 - ? size_t(1) - : m_dim.N4 * - (dimension_type::rank == - 5 - ? size_t(1) - : m_dim.N5 * - (dimension_type:: - rank == - 6 - ? size_t( - 1) - : m_dim.N6 * - (dimension_type:: - rank == - 7 - ? size_t( - 1) - : m_dim - .N7)))))))) { - } + m_stride(compute_stride(arg_layout)) {} template KOKKOS_INLINE_FUNCTION constexpr ViewOffset( @@ -1886,8 +1864,8 @@ struct ViewStride<0> { static constexpr size_t S0 = 0, S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1901,8 +1879,8 @@ struct ViewStride<1> { static constexpr size_t S1 = 0, S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1916,8 +1894,8 @@ struct ViewStride<2> { size_t S0, S1; static constexpr size_t S2 = 0, S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1931,8 +1909,8 @@ struct ViewStride<3> { size_t S0, S1, S2; static constexpr size_t S3 = 0, S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1946,8 +1924,8 @@ struct ViewStride<4> { size_t S0, S1, S2, S3; static constexpr size_t S4 = 0, S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1961,8 +1939,8 @@ struct ViewStride<5> { size_t S0, S1, S2, S3, S4; static constexpr size_t S5 = 0, S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1976,8 +1954,8 @@ struct ViewStride<6> { size_t S0, S1, S2, S3, S4, S5; static constexpr size_t S6 = 0, S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -1991,8 +1969,8 @@ struct ViewStride<7> { size_t S0, S1, S2, S3, S4, S5, S6; static constexpr size_t S7 = 0; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2005,8 +1983,8 @@ template <> struct ViewStride<8> { size_t S0, S1, S2, S3, S4, S5, S6, S7; - ViewStride() = default; - ViewStride(const ViewStride&) = default; + ViewStride() = default; + ViewStride(const ViewStride&) = default; ViewStride& operator=(const ViewStride&) = default; KOKKOS_INLINE_FUNCTION @@ -2283,8 +2261,8 @@ struct ViewOffset { } #else - ViewOffset() = default; - ViewOffset(const ViewOffset&) = default; + ViewOffset() = default; + ViewOffset(const ViewOffset&) = default; ViewOffset& operator=(const ViewOffset&) = default; #endif @@ -2398,9 +2376,9 @@ struct ViewDataHandle { template struct ViewDataHandle< Traits, - std::enable_if_t<(std::is_same::value && - std::is_void::value && + std::enable_if_t<(std::is_same_v && + std::is_void_v && Traits::memory_traits::is_atomic)>> { using value_type = typename Traits::value_type; using handle_type = typename Kokkos::Impl::AtomicViewDataHandle; @@ -2422,11 +2400,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - (!Traits::memory_traits::is_aligned) && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + (!Traits::memory_traits::is_aligned) && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; using handle_type = typename Traits::value_type* KOKKOS_RESTRICT; using return_type = typename Traits::value_type& KOKKOS_RESTRICT; @@ -2446,11 +2423,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - (!Traits::memory_traits::is_restrict) && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + (!Traits::memory_traits::is_restrict) && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2485,11 +2461,10 @@ struct ViewDataHandle< template struct ViewDataHandle< - Traits, - std::enable_if_t<(std::is_void::value && - Traits::memory_traits::is_aligned && - Traits::memory_traits::is_restrict && - (!Traits::memory_traits::is_atomic))>> { + Traits, std::enable_if_t<(std::is_void_v && + Traits::memory_traits::is_aligned && + Traits::memory_traits::is_restrict && + (!Traits::memory_traits::is_atomic))>> { using value_type = typename Traits::value_type; // typedef work-around for intel compilers error #3186: expected typedef // declaration @@ -2533,11 +2508,10 @@ namespace Impl { /** \brief View mapping for non-specialized data type and standard layout */ template class ViewMapping< - Traits, - std::enable_if_t<( - std::is_void::value && - ViewOffset::is_mapping_plugin::value)>> { + Traits, std::enable_if_t<(std::is_void_v && + ViewOffset::is_mapping_plugin::value)>> { public: using offset_type = ViewOffset; @@ -2680,28 +2654,26 @@ class ViewMapping< reference_type reference() const { return m_impl_handle[0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if layout is neither stride nor irregular, - // then just use the handle directly - !(std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if layout is neither stride nor irregular, + // then just use the handle directly + !(std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[i0]; } template - KOKKOS_FORCEINLINE_FUNCTION - std::enable_if_t<(std::is_integral::value && - // if the layout is strided or irregular, then - // we have to use the offset - (std::is_same::value || - !is_regular::value)), - reference_type> - reference(const I0& i0) const { + KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< + (std::is_integral_v && + // if the layout is strided or irregular, then + // we have to use the offset + (std::is_same_v || + !is_regular::value)), + reference_type> + reference(const I0& i0) const { return m_impl_handle[m_impl_offset(i0)]; } @@ -2780,7 +2752,7 @@ class ViewMapping< KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(const ViewMapping&) = default; - KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; + KOKKOS_DEFAULTED_FUNCTION ViewMapping(ViewMapping&&) = default; KOKKOS_DEFAULTED_FUNCTION ViewMapping& operator=(ViewMapping&&) = default; //---------------------------------------- @@ -2894,29 +2866,34 @@ template class ViewMapping< DstTraits, SrcTraits, std::enable_if_t<( - !(std::is_same:: - value) && // Added to have a new specialization for SrcType of - // LayoutStride + !(std::is_same_v)&& // Added to have a new + // specialization for + // SrcType of + // LayoutStride // default mappings - std::is_void::value && - std::is_void::value && + std::is_void_v && + std::is_void_v && ( // same layout - std::is_same::value || + std::is_same_v || // known layout - ((std::is_same::value || - std::is_same::value || - std::is_same::value) && - (std::is_same::value || - std::is_same::value || - std::is_same::value))))>> { + ((std::is_same_v || + std::is_same_v || + std::is_same_v< + typename DstTraits::array_layout, + Kokkos::LayoutStride>)&&(std::is_same_v || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutRight> || + std::is_same_v< + typename SrcTraits::array_layout, + Kokkos::LayoutStride>))))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -2926,10 +2903,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -2939,12 +2916,12 @@ class ViewMapping< }; enum { - is_assignable_layout = - std::is_same::value || - std::is_same::value || - (DstTraits::dimension::rank == 0) || (DstTraits::dimension::rank == 1) + is_assignable_layout = std::is_same_v || + std::is_same_v || + (DstTraits::dimension::rank == 0) || + (DstTraits::dimension::rank == 1) }; public: @@ -3032,22 +3009,21 @@ class ViewMapping< template class ViewMapping< DstTraits, SrcTraits, - std::enable_if_t<( - std::is_same::value && - std::is_void::value && - std::is_void::value && - ( - // same layout - std::is_same::value || - // known layout - (std::is_same::value || - std::is_same::value || - std::is_same::value)))>> { + std::enable_if_t<(std::is_same_v && + std::is_void_v && + std::is_void_v && + ( + // same layout + std::is_same_v || + // known layout + (std::is_same_v || + std::is_same_v || + std::is_same_v)))>> { private: enum { is_assignable_space = Kokkos::Impl::MemorySpaceAccess< @@ -3057,10 +3033,10 @@ class ViewMapping< enum { is_assignable_value_type = - std::is_same::value || - std::is_same::value + std::is_same_v || + std::is_same_v }; enum { @@ -3091,8 +3067,7 @@ class ViewMapping< bool assignable = true; src.stride(strides); size_t exp_stride = 1; - if (std::is_same::value) { + if (std::is_same_v) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(i - 1); if (strides[i] != exp_stride) { @@ -3100,8 +3075,8 @@ class ViewMapping< break; } } - } else if (std::is_same::value) { + } else if (std::is_same_v) { for (int i = 0; i < (int)src.Rank; i++) { if (i > 0) exp_stride *= src.extent(src.Rank - i); if (strides[src.Rank - 1 - i] != exp_stride) { @@ -3197,8 +3172,8 @@ struct SubViewDataTypeImpl> { template struct SubViewDataTypeImpl< - std::enable_if_t>::value>, - ValueType, Kokkos::Experimental::Extents, Integral, Args...> + std::enable_if_t>>, ValueType, + Kokkos::Experimental::Extents, Integral, Args...> : SubViewDataTypeImpl, Args...> {}; @@ -3230,13 +3205,13 @@ struct SubViewDataType : SubViewDataTypeImpl {}; template class ViewMapping< - std::enable_if_t<(std::is_void::value && - (std::is_same::value || - std::is_same::value || - std::is_same::value))>, + std::enable_if_t<( + std::is_void_v && + (std::is_same_v || + std::is_same_v || + std::is_same_v))>, SrcTraits, Args...> { private: static_assert(SrcTraits::rank == sizeof...(Args), @@ -3292,14 +3267,14 @@ class ViewMapping< // OutputRank 1 or 2, InputLayout Left, Interval 0 // because single stride one or second index has a stride. (rank <= 2 && R0 && - std::is_same::value) // replace with input rank + std::is_same_v) // replace with input rank || // OutputRank 1 or 2, InputLayout Right, Interval [InputRank-1] // because single stride one or second index has a stride. (rank <= 2 && R0_rev && - std::is_same::value) // replace input rank + std::is_same_v) // replace input rank ), typename SrcTraits::array_layout, Kokkos::LayoutStride>; diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewTracker.hpp b/lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp similarity index 100% rename from lib/kokkos/core/src/impl/Kokkos_ViewTracker.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewTracker.hpp diff --git a/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp b/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp new file mode 100644 index 0000000000..5eddfc68e0 --- /dev/null +++ b/lib/kokkos/core/src/View/Kokkos_ViewTraits.hpp @@ -0,0 +1,457 @@ +//@HEADER +// ************************************************************************ +// +// Kokkos v. 4.0 +// Copyright (2022) National Technology & Engineering +// Solutions of Sandia, LLC (NTESS). +// +// Under the terms of Contract DE-NA0003525 with NTESS, +// the U.S. Government retains certain rights in this software. +// +// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. +// See https://kokkos.org/LICENSE for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//@HEADER + +#ifndef KOKKOS_IMPL_PUBLIC_INCLUDE +#include +static_assert(false, + "Including non-public Kokkos header files is not allowed."); +#endif +#ifndef KOKKOS_VIEWTRAITS_HPP +#define KOKKOS_VIEWTRAITS_HPP + +#include +#include +#include +#include +#include +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +#include +#include +#endif + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Kokkos { + +struct ALL_t { + KOKKOS_FUNCTION + constexpr const ALL_t& operator()() const { return *this; } + + KOKKOS_FUNCTION + constexpr bool operator==(const ALL_t&) const { return true; } +}; + +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 +namespace Impl { +// TODO This alias declaration forces us to fully qualify ALL_t inside the +// Kokkos::Impl namespace to avoid deprecation warnings. Replace the +// fully-qualified name when we remove Kokkos::Impl::ALL_t. +using ALL_t KOKKOS_DEPRECATED_WITH_COMMENT("Use Kokkos::ALL_t instead!") = + Kokkos::ALL_t; +} // namespace Impl +#endif + +// FIXME_OPENMPTARGET - The `declare target` is needed for the Intel GPUs with +// the OpenMPTarget backend +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp declare target +#endif + +inline constexpr Kokkos::ALL_t ALL{}; + +#if defined(KOKKOS_ENABLE_OPENMPTARGET) && defined(KOKKOS_COMPILER_INTEL_LLVM) +#pragma omp end declare target +#endif + +namespace Impl { + +template +struct ViewArrayAnalysis; + +template ::non_const_value_type> +struct ViewDataAnalysis; + +template +class ViewMapping { + public: + enum : bool { is_assignable_data_type = false }; + enum : bool { is_assignable = false }; +}; + +template +constexpr KOKKOS_INLINE_FUNCTION std::size_t count_valid_integers( + const IntType i0, const IntType i1, const IntType i2, const IntType i3, + const IntType i4, const IntType i5, const IntType i6, const IntType i7) { + static_assert(std::is_integral_v, + "count_valid_integers() must have integer arguments."); + + return (i0 != KOKKOS_INVALID_INDEX) + (i1 != KOKKOS_INVALID_INDEX) + + (i2 != KOKKOS_INVALID_INDEX) + (i3 != KOKKOS_INVALID_INDEX) + + (i4 != KOKKOS_INVALID_INDEX) + (i5 != KOKKOS_INVALID_INDEX) + + (i6 != KOKKOS_INVALID_INDEX) + (i7 != KOKKOS_INVALID_INDEX); +} + +// FIXME Ideally, we would not instantiate this function for every possible View +// type. We should be able to only pass "extent" when we use mdspan. +template +KOKKOS_INLINE_FUNCTION void runtime_check_rank( + const View&, const bool is_void_spec, const size_t i0, const size_t i1, + const size_t i2, const size_t i3, const size_t i4, const size_t i5, + const size_t i6, const size_t i7, const char* label) { + (void)(label); + + if (is_void_spec) { + const size_t num_passed_args = + count_valid_integers(i0, i1, i2, i3, i4, i5, i6, i7); + // We either allow to pass as many extents as the dynamic rank is, or + // as many extents as the total rank is. In the latter case, the given + // extents for the static dimensions must match the + // compile-time extents. + constexpr int rank = View::rank(); + constexpr int dyn_rank = View::rank_dynamic(); + const bool n_args_is_dyn_rank = num_passed_args == dyn_rank; + const bool n_args_is_rank = num_passed_args == rank; + + if constexpr (rank != dyn_rank) { + if (n_args_is_rank) { + size_t new_extents[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; + for (int i = dyn_rank; i < rank; ++i) + if (new_extents[i] != View::static_extent(i)) { + KOKKOS_IF_ON_HOST( + const std::string message = + "The specified run-time extent for Kokkos::View '" + + std::string(label) + + "' does not match the compile-time extent in dimension " + + std::to_string(i) + ". The given extent is " + + std::to_string(new_extents[i]) + " but should be " + + std::to_string(View::static_extent(i)) + ".\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE( + Kokkos::abort( + "The specified run-time extents for a Kokkos::View " + "do not match the compile-time extents.");) + } + } + } + + if (!n_args_is_dyn_rank && !n_args_is_rank) { + KOKKOS_IF_ON_HOST( + const std::string message = + "Constructor for Kokkos::View '" + std::string(label) + + "' has mismatched number of arguments. The number " + "of arguments = " + + std::to_string(num_passed_args) + + " neither matches the dynamic rank = " + + std::to_string(dyn_rank) + + " nor the total rank = " + std::to_string(rank) + "\n"; + Kokkos::abort(message.c_str());) + KOKKOS_IF_ON_DEVICE(Kokkos::abort("Constructor for Kokkos View has " + "mismatched number of arguments.");) + } + } +} + +} /* namespace Impl */ +} /* namespace Kokkos */ + +// Class to provide a uniform type +namespace Kokkos { +namespace Impl { +template +struct ViewUniformType; +} +} // namespace Kokkos + +namespace Kokkos { + +#ifdef KOKKOS_ENABLE_IMPL_MDSPAN +namespace Impl { +struct UnsupportedKokkosArrayLayout; + +template +struct MDSpanViewTraits { + using mdspan_type = UnsupportedKokkosArrayLayout; +}; + +// "Natural" mdspan for a view if the View's ArrayLayout is supported. +template +struct MDSpanViewTraits::type>> { + using index_type = std::size_t; + using extents_type = + typename Impl::ExtentsFromDataType::type; + using mdspan_layout_type = + typename LayoutFromArrayLayout::type; + using accessor_type = + SpaceAwareAccessor>; + using mdspan_type = mdspan; +}; +} // namespace Impl +#endif // KOKKOS_ENABLE_IMPL_MDSPAN + +/** \class ViewTraits + * \brief Traits class for accessing attributes of a View. + * + * This is an implementation detail of View. It is only of interest + * to developers implementing a new specialization of View. + * + * Template argument options: + * - View< DataType > + * - View< DataType , Space > + * - View< DataType , Space , MemoryTraits > + * - View< DataType , ArrayLayout > + * - View< DataType , ArrayLayout , Space > + * - View< DataType , ArrayLayout , MemoryTraits > + * - View< DataType , ArrayLayout , Space , MemoryTraits > + * - View< DataType , MemoryTraits > + */ + +template +struct ViewTraits; + +template <> +struct ViewTraits { + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = void; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + // Ignore an extraneous 'void' + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + HooksPolicy, Prop...> { + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = typename ViewTraits::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = HooksPolicy; +}; + +template +struct ViewTraits::value>, + ArrayLayout, Prop...> { + // Specify layout, keep subsequent space and memory traits arguments + + using execution_space = typename ViewTraits::execution_space; + using memory_space = typename ViewTraits::memory_space; + using HostMirrorSpace = typename ViewTraits::HostMirrorSpace; + using array_layout = ArrayLayout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits::value>, Space, + Prop...> { + // Specify Space, memory traits should be the only subsequent argument. + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::HostMirrorSpace, + void> && + std::is_same_v::array_layout, + void>, + "Only one View Execution or Memory Space template argument"); + + using execution_space = typename Space::execution_space; + using memory_space = typename Space::memory_space; + using HostMirrorSpace = + typename Kokkos::Impl::HostMirror::Space::memory_space; + using array_layout = typename execution_space::array_layout; + using memory_traits = typename ViewTraits::memory_traits; + using specialize = typename ViewTraits::specialize; + using hooks_policy = typename ViewTraits::hooks_policy; +}; + +template +struct ViewTraits< + std::enable_if_t::value>, + MemoryTraits, Prop...> { + // Specify memory trait, should not be any subsequent arguments + + static_assert( + std::is_same_v::execution_space, + void> && + std::is_same_v::memory_space, + void> && + std::is_same_v::array_layout, + void> && + std::is_same_v::memory_traits, + void> && + std::is_same_v::hooks_policy, + void>, + "MemoryTrait is the final optional template argument for a View"); + + using execution_space = void; + using memory_space = void; + using HostMirrorSpace = void; + using array_layout = void; + using memory_traits = MemoryTraits; + using specialize = void; + using hooks_policy = void; +}; + +template +struct ViewTraits { + private: + // Unpack the properties arguments + using prop = ViewTraits; + + using ExecutionSpace = + std::conditional_t, + typename prop::execution_space, + Kokkos::DefaultExecutionSpace>; + + using MemorySpace = + std::conditional_t, + typename prop::memory_space, + typename ExecutionSpace::memory_space>; + + using ArrayLayout = + std::conditional_t, + typename prop::array_layout, + typename ExecutionSpace::array_layout>; + + using HostMirrorSpace = std::conditional_t< + !std::is_void_v, + typename prop::HostMirrorSpace, + typename Kokkos::Impl::HostMirror::Space>; + + using MemoryTraits = + std::conditional_t, + typename prop::memory_traits, + typename Kokkos::MemoryManaged>; + + using HooksPolicy = + std::conditional_t, + typename prop::hooks_policy, + Kokkos::Experimental::DefaultViewHooks>; + + // Analyze data type's properties, + // May be specialized based upon the layout and value type + using data_analysis = Kokkos::Impl::ViewDataAnalysis; + + public: + //------------------------------------ + // Data type traits: + + using data_type = typename data_analysis::type; + using const_data_type = typename data_analysis::const_type; + using non_const_data_type = typename data_analysis::non_const_type; + + //------------------------------------ + // Compatible array of trivial type traits: + + using scalar_array_type = typename data_analysis::scalar_array_type; + using const_scalar_array_type = + typename data_analysis::const_scalar_array_type; + using non_const_scalar_array_type = + typename data_analysis::non_const_scalar_array_type; + + //------------------------------------ + // Value type traits: + + using value_type = typename data_analysis::value_type; + using const_value_type = typename data_analysis::const_value_type; + using non_const_value_type = typename data_analysis::non_const_value_type; + + //------------------------------------ + // Mapping traits: + + using array_layout = ArrayLayout; + using dimension = typename data_analysis::dimension; + + using specialize = std::conditional_t< + std::is_void_v, + typename prop::specialize, + typename data_analysis::specialize>; /* mapping specialization tag */ + + static constexpr unsigned rank = dimension::rank; + static constexpr unsigned rank_dynamic = dimension::rank_dynamic; + + //------------------------------------ + // Execution space, memory space, memory access traits, and host mirror space. + + using execution_space = ExecutionSpace; + using memory_space = MemorySpace; + using device_type = Kokkos::Device; + using memory_traits = MemoryTraits; + using host_mirror_space = HostMirrorSpace; + using hooks_policy = HooksPolicy; + + using size_type = typename MemorySpace::size_type; + + enum { is_hostspace = std::is_same_v }; + enum { is_managed = MemoryTraits::is_unmanaged == 0 }; + enum { is_random_access = MemoryTraits::is_random_access == 1 }; + + //------------------------------------ +}; + +//---------------------------------------------------------------------------- +//---------------------------------------------------------------------------- + +namespace Impl { +template +struct TypeListToViewTraits; + +template +struct TypeListToViewTraits> { + using type = ViewTraits; +}; + +// It is not safe to assume that subviews of views with the Aligned memory trait +// are also aligned. Hence, just remove that attribute for subviews. +template +struct RemoveAlignedMemoryTrait { + private: + using type_list_in = Kokkos::Impl::type_list; + using memory_traits = typename ViewTraits::memory_traits; + using type_list_in_wo_memory_traits = + typename Kokkos::Impl::type_list_remove_first::type; + using new_memory_traits = + Kokkos::MemoryTraits; + using new_type_list = typename Kokkos::Impl::concat_type_list< + type_list_in_wo_memory_traits, + Kokkos::Impl::type_list>::type; + + public: + using type = typename TypeListToViewTraits::type; +}; +} // namespace Impl + +} /* namespace Kokkos */ + +#endif /* KOKKOS_VIEWTRAITS_HPP */ diff --git a/lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp b/lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp similarity index 88% rename from lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp rename to lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp index 7de2869a0d..1e47613285 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ViewUniformType.hpp +++ b/lib/kokkos/core/src/View/Kokkos_ViewUniformType.hpp @@ -24,11 +24,14 @@ namespace Impl { template struct ViewScalarToDataType { using type = typename ViewScalarToDataType::type *; + using const_type = + typename ViewScalarToDataType::const_type *; }; template struct ViewScalarToDataType { - using type = ScalarType; + using type = ScalarType; + using const_type = const ScalarType; }; template @@ -49,12 +52,13 @@ struct ViewUniformLayout { template struct ViewUniformType { using data_type = typename ViewType::data_type; - using const_data_type = std::add_const_t; + using const_data_type = typename ViewType::const_data_type; using runtime_data_type = typename ViewScalarToDataType::type; - using runtime_const_data_type = typename ViewScalarToDataType< - std::add_const_t, ViewType::rank>::type; + using runtime_const_data_type = + typename ViewScalarToDataType::const_type; using array_layout = typename ViewUniformLayout { } KOKKOS_FUNCTION - constexpr typename offset_policy::data_handle_type offset(data_handle_type p, - size_t i) const - noexcept { + constexpr typename offset_policy::data_handle_type offset( + data_handle_type p, size_t i) const noexcept { return nested_acc.offset(p, i); } @@ -214,6 +212,199 @@ struct AtomicAccessorRelaxed { } }; +//===================================================================== +//============= Reference Counted Accessor and DataHandle ============= +//===================================================================== + +template +class ReferenceCountedDataHandle { + public: + using value_type = ElementType; + using pointer = value_type*; + using reference = value_type&; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + ReferenceCountedDataHandle() = default; + + // this only ever works on host + explicit ReferenceCountedDataHandle(SharedAllocationRecord* rec) { + m_tracker.assign_allocated_record_to_uninitialized(rec); + m_handle = static_cast(get_record()->data()); + } + + KOKKOS_FUNCTION + ReferenceCountedDataHandle(const SharedAllocationTracker& tracker, + pointer data_handle) + : m_tracker(tracker), m_handle(data_handle) {} + + // unmanaged ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle(OtherElementType* ptr) + : m_tracker(), m_handle(ptr) {} + + // subview ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other, OtherElementType* ptr) + : m_tracker(other.m_tracker), m_handle(ptr) {} + + // converting ctor + template >> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + template < + class OtherElementType, class OtherSpace, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)>> + KOKKOS_FUNCTION ReferenceCountedDataHandle( + const ReferenceCountedDataHandle& other) + : m_tracker(other.m_tracker), m_handle(other.m_handle) {} + + KOKKOS_FUNCTION + pointer get() const noexcept { return m_handle; } + KOKKOS_FUNCTION + explicit operator pointer() const noexcept { return m_handle; } + + bool has_record() const { return m_tracker.has_record(); } + auto* get_record() const { return m_tracker.get_record(); } + int use_count() const noexcept { return m_tracker.use_count(); } + + std::string get_label() const { return m_tracker.get_label(); } + KOKKOS_FUNCTION + const SharedAllocationTracker& tracker() const noexcept { return m_tracker; } + + KOKKOS_FUNCTION + friend bool operator==(const ReferenceCountedDataHandle& lhs, + const value_type* rhs) { + return lhs.m_handle == rhs; + } + + KOKKOS_FUNCTION + friend bool operator==(const value_type* lhs, + const ReferenceCountedDataHandle& rhs) { + return lhs == rhs.m_handle; + } + + private: + template + friend class ReferenceCountedDataHandle; + + template + friend class ReferenceCountedAccessor; + + SharedAllocationTracker m_tracker; + pointer m_handle = nullptr; +}; + +template +class ReferenceCountedAccessor; + +template +struct IsReferenceCountedAccessor : std::false_type {}; + +template +struct IsReferenceCountedAccessor< + ReferenceCountedAccessor> + : std::true_type {}; + +template +class ReferenceCountedAccessor { + public: + using element_type = ElementType; + using data_handle_type = ReferenceCountedDataHandle; + using reference = typename NestedAccessor::reference; + using offset_policy = + ReferenceCountedAccessor; + using memory_space = MemorySpace; + + KOKKOS_DEFAULTED_FUNCTION + constexpr ReferenceCountedAccessor() noexcept = default; + + template < + class OtherElementType, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + std::is_constructible_v>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template < + class OtherElementType, class OtherSpace, class OtherNestedAccessor, + class = std::enable_if_t< + std::is_convertible_v && + (std::is_same_v || + std::is_same_v)&&std:: + is_constructible_v>> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const ReferenceCountedAccessor&) {} + + template >> + KOKKOS_FUNCTION constexpr ReferenceCountedAccessor( + const default_accessor&) {} + + template ::value && + std::is_convertible_v>> + KOKKOS_FUNCTION operator DstAccessor() const { + return m_nested_acc; + } + + KOKKOS_FUNCTION + constexpr reference access(data_handle_type p, size_t i) const { + return m_nested_acc.access(p.get(), i); + } + + KOKKOS_FUNCTION + constexpr data_handle_type offset(data_handle_type p, size_t i) const { + return data_handle_type(p, m_nested_acc.offset(p.get(), i)); + } + + KOKKOS_FUNCTION + constexpr auto nested_accessor() const { return m_nested_acc; } + + private: +#ifdef _MDSPAN_NO_UNIQUE_ADDRESS + _MDSPAN_NO_UNIQUE_ADDRESS +#else + [[no_unique_address]] +#endif + NestedAccessor m_nested_acc; +}; + +template +using CheckedReferenceCountedAccessor = + SpaceAwareAccessor>>; + +template +using CheckedRelaxedAtomicAccessor = + SpaceAwareAccessor>; + +template +using CheckedReferenceCountedRelaxedAtomicAccessor = SpaceAwareAccessor< + MemorySpace, ReferenceCountedAccessor>>; + } // namespace Impl } // namespace Kokkos diff --git a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp index 089628137d..f990d158bf 100644 --- a/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp +++ b/lib/kokkos/core/src/View/MDSpan/Kokkos_MDSpan_Layout.hpp @@ -23,7 +23,11 @@ static_assert(false, #define KOKKOS_EXPERIMENTAL_MDSPAN_LAYOUT_HPP #include "Kokkos_MDSpan_Extents.hpp" -#include +#include + +// The difference between a legacy Kokkos array layout and an +// mdspan layout is that the array layouts can have state, but don't have the +// nested mapping. This file provides interoperability helpers. namespace Kokkos::Impl { @@ -77,32 +81,7 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 7 ? mapping.stride(7) : 0, }; } else { - // FIXME: Kokkos Layouts don't store stride (it's in the mapping) - // We could conceivably fix this by adding an extra ViewCtorProp for - // an abritrary padding. For now we will check for this. - if constexpr (rank > 1 && - (std::is_same_v> || - std::is_same_v>)) { - [[maybe_unused]] constexpr size_t strided_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 1 - : rank - 2; - [[maybe_unused]] constexpr size_t extent_index = - std::is_same_v< - typename mapping_type::layout_type, - Kokkos::Experimental::layout_left_padded> - ? 0 - : rank - 1; - KOKKOS_ASSERT(mapping.stride(strided_index) == ext.extent(extent_index)); - } - - return ArrayLayout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, + ArrayLayout layout{rank > 0 ? ext.extent(0) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 1 ? ext.extent(1) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 2 ? ext.extent(2) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 3 ? ext.extent(3) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, @@ -110,12 +89,98 @@ KOKKOS_INLINE_FUNCTION auto array_layout_from_mapping( rank > 5 ? ext.extent(5) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 6 ? ext.extent(6) : KOKKOS_IMPL_CTOR_DEFAULT_ARG, rank > 7 ? ext.extent(7) : KOKKOS_IMPL_CTOR_DEFAULT_ARG}; + + if constexpr (rank > 1 && + std::is_same_v>) { + layout.stride = mapping.stride(1); + } + if constexpr (std::is_same_v>) { + if constexpr (rank == 2) { + layout.stride = mapping.stride(0); + } + if constexpr (rank > 2) { + if (mapping.stride(rank - 2) != mapping.extents().extent(rank - 1)) + Kokkos::abort( + "Invalid conversion from layout_right_padded to LayoutRight"); + } + } + return layout; } #ifdef KOKKOS_COMPILER_INTEL __builtin_unreachable(); #endif } +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + ArrayLayout layout, std::index_sequence) { + using index_type = typename MappingType::index_type; + using extents_type = typename MappingType::extents_type; + if constexpr (std::is_same_v || + std::is_same_v) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if (layout.stride == KOKKOS_IMPL_CTOR_DEFAULT_ARG || + extents_type::rank() < 2) { + return MappingType{ + extents_type{dextents{ + layout.dimension[Idx]...}}}; + } else { + if constexpr (std::is_same_v && + extents_type::rank() > 2) { + size_t product_of_dimensions = 1; + for (size_t r = 1; r < extents_type::rank(); r++) + product_of_dimensions *= layout.dimension[r]; + if (product_of_dimensions != layout.stride) + Kokkos::abort( + "Invalid conversion from LayoutRight to layout_right_padded"); + } else { + return MappingType{ + extents_type{ + dextents{ + layout.dimension[Idx]...}}, + layout.stride}; + } + } + } +} +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride layout, std::index_sequence) { + static_assert( + std::is_same_v); + using index_type = typename MappingType::index_type; + index_type strides[MappingType::extents_type::rank()] = { + layout.stride[Idx]...}; + return MappingType{ + mdspan_non_standard_tag(), + static_cast( + dextents{ + layout.dimension[Idx]...}), + strides}; +} + +// specialization for rank 0 to avoid empty array +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout_impl( + LayoutStride, std::index_sequence<>) { + return MappingType{}; +} + +template +KOKKOS_INLINE_FUNCTION auto mapping_from_array_layout(ArrayLayout layout) { + return mapping_from_array_layout_impl( + layout, std::make_index_sequence()); +} + template KOKKOS_INLINE_FUNCTION auto mapping_from_view_mapping(const VM &view_mapping) { using mapping_type = typename MDSpanType::mapping_type; diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp index ebdf2c8211..79c137bfdd 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_CUDA.hpp @@ -28,7 +28,9 @@ #include #include #include +#ifdef KOKKOS_ENABLE_DEPRECATED_CODE_4 #include +#endif #include #include #include diff --git a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp index d13c90825c..3570ed2b6e 100644 --- a/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp +++ b/lib/kokkos/core/src/decl/Kokkos_Declare_SYCL.hpp @@ -35,6 +35,16 @@ #include #include #include + +namespace Kokkos { +namespace Experimental { +using SYCLDeviceUSMSpace = ::Kokkos::SYCLDeviceUSMSpace; +using SYCLHostUSMSpace = ::Kokkos::SYCLHostUSMSpace; +using SYCLSharedUSMSpace = ::Kokkos::SYCLSharedUSMSpace; +using SYCL = ::Kokkos::SYCL; +} // namespace Experimental +} // namespace Kokkos + #endif #endif diff --git a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp index 400794f865..399b986041 100644 --- a/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp +++ b/lib/kokkos/core/src/fwd/Kokkos_Fwd_SYCL.hpp @@ -19,7 +19,6 @@ #if defined(KOKKOS_ENABLE_SYCL) namespace Kokkos { -namespace Experimental { class SYCLDeviceUSMSpace; ///< Memory space on SYCL device, not accessible from ///< the host class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL @@ -27,7 +26,6 @@ class SYCLSharedUSMSpace; ///< Memory space accessible from both the SYCL class SYCLHostUSMSpace; ///< Memory space accessible from both the SYCL ///< device and the host (host pinned) class SYCL; ///< Execution space for SYCL -} // namespace Experimental } // namespace Kokkos #endif #endif diff --git a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp index a44ffefa6b..a9db2c4cf4 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_Host_IterateTile.hpp @@ -1458,7 +1458,7 @@ struct Tile_Loop_Type<8, IsLeft, IType, void, void> { template struct Tile_Loop_Type<1, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1477,7 +1477,7 @@ struct Tile_Loop_Type<1, IsLeft, IType, Tagged, template struct Tile_Loop_Type<2, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1496,7 +1496,7 @@ struct Tile_Loop_Type<2, IsLeft, IType, Tagged, template struct Tile_Loop_Type<3, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1515,7 +1515,7 @@ struct Tile_Loop_Type<3, IsLeft, IType, Tagged, template struct Tile_Loop_Type<4, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1534,7 +1534,7 @@ struct Tile_Loop_Type<4, IsLeft, IType, Tagged, template struct Tile_Loop_Type<5, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1553,7 +1553,7 @@ struct Tile_Loop_Type<5, IsLeft, IType, Tagged, template struct Tile_Loop_Type<6, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1572,7 +1572,7 @@ struct Tile_Loop_Type<6, IsLeft, IType, Tagged, template struct Tile_Loop_Type<7, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1591,7 +1591,7 @@ struct Tile_Loop_Type<7, IsLeft, IType, Tagged, template struct Tile_Loop_Type<8, IsLeft, IType, Tagged, - std::enable_if_t::value>> { + std::enable_if_t>> { template static void apply(Func const& func, bool cond, Offset const& offset, ExtentA const& a, ExtentB const& b) { @@ -1616,7 +1616,7 @@ struct HostIterateTile; // For ParallelFor template struct HostIterateTile::value>> { + std::enable_if_t>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -1635,12 +1635,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2000,30 +1999,28 @@ struct HostIterateTile - std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && std::is_void_v), void> apply(Args&&... args) const { m_func(args...); } template - std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void::value), - void> + std::enable_if_t<(sizeof...(Args) == RP::rank && !std::is_void_v), void> apply(Args&&... args) const { m_func(m_tag, args...); } RP const m_rp; Functor const m_func; - std::conditional_t::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce // ValueType - scalar: For reductions template struct HostIterateTile::value && - !std::is_array::value>> { + std::enable_if_t && + !std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2050,12 +2047,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2430,7 +2426,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // For ParallelReduce @@ -2438,8 +2434,8 @@ struct HostIterateTile struct HostIterateTile::value && - std::is_array::value>> { + std::enable_if_t && + std::is_array_v>> { using index_type = typename RP::index_type; using point_type = typename RP::point_type; @@ -2463,12 +2459,11 @@ struct HostIterateTile 0 - ? (m_rp.m_upper[i] - offset[i]) - : (m_rp.m_upper[i] - - m_rp.m_lower[i]); // when single tile encloses range + (m_rp.m_upper[i] - 1 - offset[i]) == 0 ? 1 + : (m_rp.m_upper[i] - m_rp.m_tile[i]) > 0 + ? (m_rp.m_upper[i] - offset[i]) + : (m_rp.m_upper[i] - + m_rp.m_lower[i]); // when single tile encloses range } } @@ -2842,7 +2837,7 @@ struct HostIterateTile::value, int, Tag> m_tag; + std::conditional_t, int, Tag> m_tag; }; // ------------------------------------------------------------------ // diff --git a/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp b/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp index e1273ab9e3..e6b2fcbef4 100644 --- a/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp +++ b/lib/kokkos/core/src/impl/KokkosExp_IterateTileGPU.hpp @@ -41,13 +41,13 @@ struct EmulateCUDADim3 { template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f((Args &&) args...); + f((Args&&)args...); } template KOKKOS_IMPL_FORCEINLINE_FUNCTION std::enable_if_t::value> _tag_invoke(Functor const& f, Args&&... args) { - f(Tag{}, (Args &&) args...); + f(Tag{}, (Args&&)args...); } template , Args&&... args) { - _tag_invoke(f, vals[Idxs]..., (Args &&) args...); + _tag_invoke(f, vals[Idxs]..., (Args&&)args...); } template @@ -63,7 +63,7 @@ KOKKOS_IMPL_FORCEINLINE_FUNCTION void _tag_invoke_array(Functor const& f, T (&vals)[N], Args&&... args) { _tag_invoke_array_helper(f, vals, std::make_index_sequence{}, - (Args &&) args...); + (Args&&)args...); } // ------------------------------------------------------------------ // diff --git a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp index d77ec0c753..b483653021 100644 --- a/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_AnalyzePolicy.hpp @@ -143,7 +143,7 @@ struct AnalyzeExecPolicyUseMatcher, Trait, Traits...> { static constexpr auto trigger_error_message = show_name_of_invalid_execution_policy_trait{}; static_assert( - /* always false: */ std::is_void::value, + /* always false: */ std::is_void_v, "Unknown execution policy trait. Search compiler output for " "'show_name_of_invalid_execution_policy_trait' to see the type of the " "invalid trait."); diff --git a/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp b/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp index d8ab77b205..4ea0b8d343 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ChaseLev.hpp @@ -95,12 +95,12 @@ struct non_owning_variable_size_circular_buffer { non_owning_variable_size_circular_buffer( non_owning_variable_size_circular_buffer const&) = delete; non_owning_variable_size_circular_buffer( - non_owning_variable_size_circular_buffer&&) = default; - non_owning_variable_size_circular_buffer& operator =( - non_owning_variable_size_circular_buffer const&) = delete; - non_owning_variable_size_circular_buffer& operator =( non_owning_variable_size_circular_buffer&&) = default; - ~non_owning_variable_size_circular_buffer() = default; + non_owning_variable_size_circular_buffer& operator=( + non_owning_variable_size_circular_buffer const&) = delete; + non_owning_variable_size_circular_buffer& operator=( + non_owning_variable_size_circular_buffer&&) = default; + ~non_owning_variable_size_circular_buffer() = default; KOKKOS_FORCEINLINE_FUNCTION constexpr size_type size() const noexcept { return m_size; } @@ -138,7 +138,7 @@ struct ChaseLevDeque { public: template ::value>> + std::is_default_constructible_v>> ChaseLevDeque() : m_array() {} explicit ChaseLevDeque(CircularBufferT buffer) : m_array(std::move(buffer)) {} @@ -165,7 +165,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; @@ -226,7 +226,7 @@ struct ChaseLevDeque { #ifdef _WIN32 Kokkos::memory_fence(); bool const success = - Kokkos::atomic_compare_exchange_strong(&m_top, t, t + 1); + (t == Kokkos::atomic_compare_exchange(&m_top, t, t + 1)); Kokkos::memory_fence(); if (!success) { return_value = nullptr; diff --git a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp index 6e3d99ebd6..ee53fd8bc6 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ClockTic.hpp @@ -27,8 +27,9 @@ // To use OpenCL(TM) built-in intrinsics inside kernels, we have to // forward-declare their prototype, also see // https://github.com/intel/pti-gpu/blob/master/chapters/binary_instrumentation/OpenCLBuiltIn.md -#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +#if defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) extern SYCL_EXTERNAL unsigned long __attribute__((overloadable)) intel_get_cycle_counter(); #endif @@ -55,8 +56,10 @@ KOKKOS_IMPL_DEVICE_FUNCTION inline uint64_t clock_tic_device() noexcept { // Return value of 64-bit hi-res clock register. return clock64(); -#elif defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_ARCH_INTEL_GPU) && \ - defined(__SYCL_DEVICE_ONLY__) +// FIXME_SYCL We can only return something useful for Intel GPUs and with RDC +#elif defined(KOKKOS_ENABLE_SYCL) && \ + defined(KOKKOS_ENABLE_SYCL_RELOCATABLE_DEVICE_CODE) && \ + defined(KOKKOS_ARCH_INTEL_GPU) && defined(__SYCL_DEVICE_ONLY__) return intel_get_cycle_counter(); diff --git a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp index e6dd3c6339..d7319e80c8 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Combined_Reducer.hpp @@ -93,7 +93,7 @@ struct CombinedReducerValueImpl, std::move(arg_values))... {} template - KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { + KOKKOS_INLINE_FUNCTION ValueType& get() & noexcept { return this->CombinedReducerValueItemImpl::ref(); } template @@ -181,7 +181,7 @@ struct CombinedReducerImpl, Space, KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl( - CombinedReducerImpl&&) = default; + CombinedReducerImpl&&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( CombinedReducerImpl const&) = default; KOKKOS_DEFAULTED_FUNCTION constexpr CombinedReducerImpl& operator=( @@ -192,8 +192,8 @@ struct CombinedReducerImpl, Space, template KOKKOS_FUNCTION constexpr explicit CombinedReducerImpl( value_type& value, ReducersDeduced&&... reducers) noexcept - : CombinedReducerStorageImpl((ReducersDeduced &&) - reducers)..., + : CombinedReducerStorageImpl( + (ReducersDeduced&&)reducers)..., m_value_view(&value) {} KOKKOS_FUNCTION constexpr void join(value_type& dest, @@ -348,8 +348,8 @@ struct CombinedReductionFunctorWrapperImpl< IndexOrMemberOrTagType1&& arg_first, IndexOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl( - (IndexOrMemberOrTagType1 &&) arg_first, - (IndexOrMemberTypesThenValueType &&) args...); + (IndexOrMemberOrTagType1&&)arg_first, + (IndexOrMemberTypesThenValueType&&)args...); } // end call operator }}}2 @@ -369,19 +369,19 @@ struct CombinedReductionFunctorWrapperImpl< template KOKKOS_FORCEINLINE_FUNCTION std::enable_if_t< - !std::is_same, value_type>::value> + !std::is_same_v, value_type>> _call_op_impl(IdxOrMemberTypes&&... idxs, IdxOrMemberType1&& idx, IdxOrMemberTypesThenValueType&&... args) const { this->template _call_op_impl( - (IdxOrMemberTypes &&) idxs..., (IdxOrMemberType1 &&) idx, - (IdxOrMemberTypesThenValueType &&) args...); + (IdxOrMemberTypes&&)idxs..., (IdxOrMemberType1&&)idx, + (IdxOrMemberTypesThenValueType&&)args...); } // base case template KOKKOS_FORCEINLINE_FUNCTION void _call_op_impl(IdxOrMemberTypes&&... idxs, value_type& out) const { - m_functor((IdxOrMemberTypes &&) idxs..., + m_functor((IdxOrMemberTypes&&)idxs..., out.template get()...); } }; @@ -464,8 +464,8 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer_value( typename _reducer_from_arg_t::value_type...>{ // This helper function is now poorly named after refactoring. - _get_value_from_combined_reducer_ctor_arg((ReferencesOrViewsOrReducers &&) - args)...}; + _get_value_from_combined_reducer_ctor_arg( + (ReferencesOrViewsOrReducers&&)args)...}; //---------------------------------------- } @@ -480,7 +480,7 @@ KOKKOS_INLINE_FUNCTION constexpr auto make_combined_reducer( Space, _reducer_from_arg_t...>; return reducer_type(value, _reducer_from_arg_t{ - (ReferencesOrViewsOrReducers &&) args}...); + (ReferencesOrViewsOrReducers&&)args}...); //---------------------------------------- } diff --git a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp index ca4edce5c3..9bde2f72a3 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ConcurrentBitset.hpp @@ -110,15 +110,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? type(-2, -2) : type(-1, -1); } @@ -132,7 +132,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -194,15 +195,15 @@ struct concurrent_bitset { // when is full at the atomic_fetch_add(+1) // then a release occurs before the atomic_fetch_add(-1). - const uint32_t state = (uint32_t)Kokkos::atomic_fetch_add( - reinterpret_cast(buffer), 1); + const uint32_t state = + Kokkos::atomic_fetch_add(const_cast(buffer), 1); const uint32_t state_error = state_header != (state & state_header_mask); const uint32_t state_bit_used = state & state_used_mask; if (state_error || (bit_bound <= state_bit_used)) { - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return state_error ? type(-2, -2) : type(-1, -1); } @@ -216,7 +217,8 @@ struct concurrent_bitset { while (1) { const uint32_t word = bit >> bits_per_int_lg2; const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = Kokkos::atomic_fetch_or(buffer + word + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + word + 1, mask); if (!(prev & mask)) { // Successfully claimed 'result.first' by @@ -262,8 +264,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_and(buffer + (bit >> bits_per_int_lg2) + 1, ~mask); + const uint32_t prev = Kokkos::atomic_fetch_and( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, ~mask); if (!(prev & mask)) { return -1; @@ -273,7 +275,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); // Flush the store-release Kokkos::memory_fence(); @@ -299,8 +301,8 @@ struct concurrent_bitset { } const uint32_t mask = 1u << (bit & bits_per_int_mask); - const uint32_t prev = - Kokkos::atomic_fetch_or(buffer + (bit >> bits_per_int_lg2) + 1, mask); + const uint32_t prev = Kokkos::atomic_fetch_or( + const_cast(buffer) + (bit >> bits_per_int_lg2) + 1, mask); if (!(prev & mask)) { return -1; @@ -310,7 +312,7 @@ struct concurrent_bitset { Kokkos::memory_fence(); const int count = - Kokkos::atomic_fetch_add(reinterpret_cast(buffer), -1); + Kokkos::atomic_fetch_sub(const_cast(buffer), 1); return (count & state_used_mask) - 1; } diff --git a/lib/kokkos/core/src/impl/Kokkos_Core.cpp b/lib/kokkos/core/src/impl/Kokkos_Core.cpp index 532709aa98..72f33ffaab 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Core.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_Core.cpp @@ -138,7 +138,7 @@ int get_device_count() { KOKKOS_IMPL_HIP_SAFE_CALL(hipGetDeviceCount(&count)); return count; #elif defined(KOKKOS_ENABLE_SYCL) - return Kokkos::Experimental::Impl::get_sycl_devices().size(); + return Kokkos::Impl::get_sycl_devices().size(); #elif defined(KOKKOS_ENABLE_OPENACC) return acc_get_num_devices( Kokkos::Experimental::Impl::OpenACC_Traits::dev_type); @@ -183,7 +183,7 @@ std::vector const& Kokkos::Impl::get_visible_devices() { #elif defined(KOKKOS_ENABLE_OPENMPTARGET) int device = omp_get_default_device(); // FIXME_OPENMPTARGET #elif defined(KOKKOS_ENABLE_SYCL) - int device = Experimental::Impl::SYCLInternal::m_syclDev; + int device = Impl::SYCLInternal::m_syclDev; #else int device = -1; return device; @@ -271,7 +271,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: local rank " << local_rank << " is outside the bounds of resource groups provided by CTest. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the resource types allocated to this resource group @@ -284,7 +284,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_name << " is not specified. Raised" << " by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Look for the device type specified in CTEST_KOKKOS_DEVICE_TYPE @@ -308,7 +308,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { ss << "Error: device type '" << ctest_kokkos_device_type << "' not included in " << ctest_resource_group_name << ". Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } // Get the device ID @@ -324,7 +324,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: " << ctest_resource_group_id_name << " is not specified. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } auto const* comma = std::strchr(resource_str, ','); @@ -332,7 +332,7 @@ int Kokkos::Impl::get_ctest_gpu(int local_rank) { std::ostringstream ss; ss << "Error: invalid value of " << ctest_resource_group_id_name << ": '" << resource_str << "'. Raised by Kokkos::Impl::get_ctest_gpu()."; - throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } std::string id(resource_str + 3, comma - resource_str - 3); @@ -613,7 +613,7 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #endif declare_configuration_metadata("architecture", "Default Device", - typeid(Kokkos::DefaultExecutionSpace).name()); + Kokkos::DefaultExecutionSpace::name()); #if defined(KOKKOS_ARCH_A64FX) declare_configuration_metadata("architecture", "CPU architecture", "A64FX"); @@ -666,6 +666,9 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { #elif defined(KOKKOS_ARCH_RISCV_SG2042) declare_configuration_metadata("architecture", "CPU architecture", "SG2042 (RISC-V)") +#elif defined(KOKKOS_ARCH_RISCV_RVA22V) + declare_configuration_metadata("architecture", "CPU architecture", + "RVA22V (RISC-V)") #else declare_configuration_metadata("architecture", "CPU architecture", "none"); #endif @@ -738,8 +741,8 @@ void pre_initialize_internal(const Kokkos::InitializationSettings& settings) { declare_configuration_metadata("architecture", "GPU architecture", "HOPPER90"); #elif defined(KOKKOS_ARCH_AMD_GFX906) - declare_configuration_metadata("architecture", "GPU architecture", - "AMD_GFX906"); + declare_configuration_metadata("architecture", "GPU architecture", + "AMD_GFX906"); #elif defined(KOKKOS_ARCH_AMD_GFX908) declare_configuration_metadata("architecture", "GPU architecture", "AMD_GFX908"); @@ -976,7 +979,7 @@ void Kokkos::Impl::parse_environment_variables( Tools::Impl::parse_environment_variables(tools_init_arguments); if (init_result.result == Tools::Impl::InitializationStatus::environment_argument_mismatch) { - Impl::throw_runtime_exception(init_result.error_message); + Kokkos::abort(init_result.error_message.c_str()); } combine(settings, tools_init_arguments); diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp index c71c21d2ac..cd00fdadeb 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNodeKernel.hpp @@ -36,15 +36,22 @@ struct GraphNodeKernelDefaultImpl { // TODO @graphs decide if this should use vtable or intrusive erasure via // function pointers like in the rest of the graph interface virtual void execute_kernel() = 0; + + GraphNodeKernelDefaultImpl() = default; + + explicit GraphNodeKernelDefaultImpl(ExecutionSpace exec) + : m_execution_space(std::move(exec)) {} + + ExecutionSpace m_execution_space; }; // TODO Indicate that this kernel specialization is only for the Host somehow? template class GraphNodeKernelImpl - : public PatternImplSpecializationFromTag::type, - public GraphNodeKernelDefaultImpl { + : public GraphNodeKernelDefaultImpl, + public PatternImplSpecializationFromTag::type { public: using base_t = typename PatternImplSpecializationFromTag - GraphNodeKernelImpl(std::string const&, ExecutionSpace const&, - Functor arg_functor, PolicyDeduced&& arg_policy, - ArgsDeduced&&... args) - : base_t(std::move(arg_functor), (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...), - execute_kernel_vtable_base_t() {} + GraphNodeKernelImpl(std::string const &, ExecutionSpace const &, + Functor arg_functor, PolicyDeduced &&arg_policy, + ArgsDeduced &&...args) + : execute_kernel_vtable_base_t(arg_policy.space()), + base_t(std::move(arg_functor), (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) {} // FIXME @graph Forward through the instance once that works in the backends template - GraphNodeKernelImpl(ExecutionSpace const& ex, Functor arg_functor, - PolicyDeduced&& arg_policy, ArgsDeduced&&... args) + GraphNodeKernelImpl(ExecutionSpace const &ex, Functor arg_functor, + PolicyDeduced &&arg_policy, ArgsDeduced &&...args) : GraphNodeKernelImpl("", ex, std::move(arg_functor), - (PolicyDeduced &&) arg_policy, - (ArgsDeduced &&) args...) {} + (PolicyDeduced &&)arg_policy, + (ArgsDeduced &&)args...) { + // FIXME This constructor seem unused. + } - void execute_kernel() final { this->base_t::execute(); } + void execute_kernel() override final { this->base_t::execute(); } }; // end GraphNodeKernelImpl }}}1 @@ -88,7 +97,7 @@ struct GraphNodeAggregateKernelDefaultImpl using is_graph_kernel = std::true_type; }; using graph_kernel = GraphNodeAggregateKernelDefaultImpl; - void execute_kernel() final {} + void execute_kernel() override final {} }; } // end namespace Impl diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp index 223ae391ab..31d147ea89 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_GraphNode_Impl.hpp @@ -69,10 +69,10 @@ struct GraphNodeBackendSpecificDetails { GraphNodeBackendSpecificDetails(GraphNodeBackendSpecificDetails&&) noexcept = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails const&) = delete; - GraphNodeBackendSpecificDetails& operator =( + GraphNodeBackendSpecificDetails& operator=( GraphNodeBackendSpecificDetails&&) noexcept = delete; ~GraphNodeBackendSpecificDetails() = default; @@ -92,6 +92,18 @@ struct GraphNodeBackendSpecificDetails { m_is_aggregate = true; } + // A node is awaitable if it can execute a kernel. + // A root node or an aggregate node cannot be waited for, because it does + // not launch anything. + bool awaitable() const { return (!m_is_root) && (!m_is_aggregate); } + + // Retrieve the execution space instance that has been passed to + // the kernel at construction phase. + const ExecutionSpace& get_execution_space() const { + KOKKOS_EXPECTS(m_kernel_ptr != nullptr) + return m_kernel_ptr->m_execution_space; + } + void set_predecessor( std::shared_ptr> arg_pred_impl) { @@ -104,7 +116,7 @@ struct GraphNodeBackendSpecificDetails { m_predecessors.push_back(std::move(arg_pred_impl)); } - void execute_node() { + void execute_node(const ExecutionSpace& exec) { // This node could have already been executed as the predecessor of some // other KOKKOS_EXPECTS(bool(m_kernel_ptr) || m_has_executed) @@ -115,8 +127,18 @@ struct GraphNodeBackendSpecificDetails { // supported semantics, but instinct I have feels like it should be... m_has_executed = true; for (auto const& predecessor : m_predecessors) { - predecessor->execute_node(); + predecessor->execute_node(exec); } + + // Before executing the kernel, be sure to fence the execution space + // instance of predecessors. + for (const auto& predecessor : m_predecessors) { + if (predecessor->awaitable() && + predecessor->get_execution_space() != this->get_execution_space()) + predecessor->get_execution_space().fence( + "Kokkos::DefaultGraphNode::execute_node: sync with predecessors"); + } + m_kernel_ptr->execute_kernel(); } KOKKOS_ENSURES(m_has_executed) diff --git a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp index 05d4854919..8dfa19a178 100644 --- a/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_Default_Graph_Impl.hpp @@ -58,12 +58,12 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { // Not movable or copyable; it spends its whole live as a shared_ptr in the // Graph object - GraphImpl() = default; - GraphImpl(GraphImpl const&) = delete; - GraphImpl(GraphImpl&&) = delete; + GraphImpl() = default; + GraphImpl(GraphImpl const&) = delete; + GraphImpl(GraphImpl&&) = delete; GraphImpl& operator=(GraphImpl const&) = delete; - GraphImpl& operator=(GraphImpl&&) = delete; - ~GraphImpl() = default; + GraphImpl& operator=(GraphImpl&&) = delete; + ~GraphImpl() = default; explicit GraphImpl(ExecutionSpace arg_space) : execution_space_instance_storage_base_t(std::move(arg_space)) {} @@ -136,17 +136,40 @@ struct GraphImpl : private ExecutionSpaceInstanceStorage { return rv; } - void submit() { + void instantiate() { + KOKKOS_EXPECTS(!m_has_been_instantiated); + m_has_been_instantiated = true; + } + + void submit(const ExecutionSpace& exec) { + if (!m_has_been_instantiated) instantiate(); // This reset is gross, but for the purposes of our simple host // implementation... for (auto& sink : m_sinks) { sink->reset_has_executed(); } + + // We don't know where the nodes will execute, so we need to fence the given + // execution space instance before proceeding. This is the simplest way + // of guaranteeing that the kernels in the graph are correctly "enqueued". + exec.fence( + "Kokkos::DefaultGraph::submit: fencing before launching graph nodes"); + for (auto& sink : m_sinks) { - sink->execute_node(); + sink->execute_node(exec); + } + + // Once all sinks have been executed, we need to fence them. + for (const auto& sink : m_sinks) { + if (sink->awaitable() && sink->get_execution_space() != exec) + sink->get_execution_space().fence( + "Kokkos::DefaultGraph::submit: fencing before ending graph submit"); } } + private: + bool m_has_been_instantiated = false; + // end required customizations }}}2 //---------------------------------------------------------------------------- }; diff --git a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp index 8ba94ba4cc..a8a4d6617b 100644 --- a/lib/kokkos/core/src/impl/Kokkos_EBO.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_EBO.hpp @@ -52,16 +52,16 @@ struct EBOBaseImpl; template class CtorNotOnDevice> struct EBOBaseImpl { template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && !CtorNotOnDevice::value, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&...) noexcept {} template ::value && - std::is_constructible::value && + std::enable_if_t && + std::is_constructible_v && CtorNotOnDevice::value, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&...) noexcept {} @@ -110,18 +110,18 @@ struct EBOBaseImpl { T m_ebo_object; template ::value && + std::enable_if_t && !CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, int> = 0> KOKKOS_FORCEINLINE_FUNCTION constexpr explicit EBOBaseImpl( Args&&... args) noexcept(noexcept(T(std::forward(args)...))) : m_ebo_object(std::forward(args)...) {} template ::value && + std::enable_if_t && CTorsNotOnDevice::value && - std::is_constructible::value, + std::is_constructible_v, long> = 0> inline constexpr explicit EBOBaseImpl(Args&&... args) noexcept( noexcept(T(std::forward(args)...))) @@ -167,9 +167,9 @@ struct EBOBaseImpl { template class CtorsNotOnDevice = NoCtorsNotOnDevice> struct StandardLayoutNoUniqueAddressMemberEmulation - : EBOBaseImpl::value, CtorsNotOnDevice> { + : EBOBaseImpl, CtorsNotOnDevice> { private: - using ebo_base_t = EBOBaseImpl::value, CtorsNotOnDevice>; + using ebo_base_t = EBOBaseImpl, CtorsNotOnDevice>; public: using ebo_base_t::ebo_base_t; diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp index 04c5e0bd22..58a5de2aa6 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp +++ b/lib/kokkos/core/src/impl/Kokkos_ExecPolicy.cpp @@ -41,7 +41,7 @@ void team_policy_check_valid_storage_level_argument(int level) { std::stringstream ss; ss << "TeamPolicy::set_scratch_size(/*level*/ " << level << ", ...) storage level argument must be 0 or 1 to be valid\n"; - Impl::throw_runtime_exception(ss.str()); + abort(ss.str().c_str()); } } diff --git a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp index 58ed54275a..5805b78ee7 100644 --- a/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_ExecSpaceManager.hpp @@ -123,14 +123,14 @@ template struct ExecSpaceDerived : ExecSpaceBase { static_assert(check_valid_execution_space()); static_assert(check_is_regular()); - void initialize(InitializationSettings const& settings) final { + void initialize(InitializationSettings const& settings) override final { ExecutionSpace::impl_initialize(settings); } - void finalize() final { ExecutionSpace::impl_finalize(); } - void static_fence(std::string const& label) final { + void finalize() override final { ExecutionSpace::impl_finalize(); } + void static_fence(std::string const& label) override final { ExecutionSpace::impl_static_fence(label); } - void print_configuration(std::ostream& os, bool verbose) final { + void print_configuration(std::ostream& os, bool verbose) override final { ExecutionSpace().print_configuration(os, verbose); } }; diff --git a/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp b/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp deleted file mode 100644 index 4726a87b97..0000000000 --- a/lib/kokkos/core/src/impl/Kokkos_FixedBufferMemoryPool.hpp +++ /dev/null @@ -1,279 +0,0 @@ -//@HEADER -// ************************************************************************ -// -// Kokkos v. 4.0 -// Copyright (2022) National Technology & Engineering -// Solutions of Sandia, LLC (NTESS). -// -// Under the terms of Contract DE-NA0003525 with NTESS, -// the U.S. Government retains certain rights in this software. -// -// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions. -// See https://kokkos.org/LICENSE for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//@HEADER - -#ifndef KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP -#define KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP - -#include -#include - -#include -#include - -namespace Kokkos { -namespace Impl { - -template -class FixedBlockSizeMemoryPool - : private MemorySpaceInstanceStorage { - public: - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - - private: - using memory_space_storage_base = - MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { - union { - char ignore; - char data[Size]; - }; - }; - - static constexpr auto actual_size = sizeof(Block); - - // TODO shared allocation tracker - // TODO @optimization put the index values on different cache lines (CPU) or - // pages (GPU)? - - tracker_type m_tracker = {}; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - Kokkos::OwningRawPtr m_first_block = nullptr; - Kokkos::OwningRawPtr m_free_indices = nullptr; - - enum : size_type { IndexInUse = ~size_type(0) }; - - public: - FixedBlockSizeMemoryPool(memory_space const& mem_space, size_type num_blocks) - : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block)); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = - record_type::allocate(mem_space, "Kokkos::FixedBlockSizeMemPool_blocks", - num_blocks * sizeof(size_type)); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for (size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool(memory_space const& mem_space, - size_t mempool_capacity, unsigned, unsigned, - unsigned) - : FixedBlockSizeMemoryPool( - mem_space, mempool_capacity / - actual_size) { /* forwarding ctor, must be empty */ - } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool( - FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=( - FixedBlockSizeMemoryPool const&) = default; - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept { - (void)alloc_size; - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], - current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if (free_idx == IndexInUse) { - return nullptr; - } else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type /*alloc_size*/) const noexcept { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && - offset / actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add( - (volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } -}; - -#if 0 -template < - class DeviceType, - size_t Size, - size_t Align=1, - class SizeType = typename DeviceType::execution_space::size_type -> -class FixedBlockSizeChaseLevMemoryPool - : private MemorySpaceInstanceStorage -{ -public: - - using memory_space = typename DeviceType::memory_space; - using size_type = SizeType; - -private: - - using memory_space_storage_base = MemorySpaceInstanceStorage; - using tracker_type = Kokkos::Impl::SharedAllocationTracker; - using record_type = Kokkos::Impl::SharedAllocationRecord; - - struct alignas(Align) Block { union { char ignore; char data[Size]; }; }; - - static constexpr auto actual_size = sizeof(Block); - - tracker_type m_tracker = { }; - size_type m_num_blocks = 0; - size_type m_first_free_idx = 0; - size_type m_last_free_idx = 0; - - - enum : size_type { IndexInUse = ~size_type(0) }; - -public: - - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_type num_blocks - ) : memory_space_storage_base(mem_space), - m_tracker(), - m_num_blocks(num_blocks), - m_first_free_idx(0), - m_last_free_idx(num_blocks) - { - // TODO alignment? - auto block_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(Block) - ); - KOKKOS_ASSERT(intptr_t(block_record->data()) % Align == 0); - m_tracker.assign_allocated_record_to_uninitialized(block_record); - m_first_block = (Block*)block_record->data(); - - auto idx_record = record_type::allocate( - mem_space, "FixedBlockSizeMemPool_blocks", num_blocks * sizeof(size_type) - ); - KOKKOS_ASSERT(intptr_t(idx_record->data()) % alignof(size_type) == 0); - m_tracker.assign_allocated_record_to_uninitialized(idx_record); - m_free_indices = (size_type*)idx_record->data(); - - for(size_type i = 0; i < num_blocks; ++i) { - m_free_indices[i] = i; - } - - Kokkos::memory_fence(); - } - - // For compatibility with MemoryPool<> - FixedBlockSizeMemoryPool( - memory_space const& mem_space, - size_t mempool_capacity, - unsigned, unsigned, unsigned - ) : FixedBlockSizeMemoryPool(mem_space, mempool_capacity / actual_size) - { /* forwarding ctor, must be empty */ } - - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool() = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool(FixedBlockSizeMemoryPool const&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool&&) = default; - KOKKOS_DEFAULTED_FUNCTION FixedBlockSizeMemoryPool& operator=(FixedBlockSizeMemoryPool const&) = default; - - - KOKKOS_INLINE_FUNCTION - void* allocate(size_type alloc_size) const noexcept - { - KOKKOS_EXPECTS(alloc_size <= Size); - auto free_idx_counter = Kokkos::atomic_fetch_add((volatile size_type*)&m_first_free_idx, size_type(1)); - auto free_idx_idx = free_idx_counter % m_num_blocks; - - // We don't have exclusive access to m_free_indices[free_idx_idx] because - // the allocate counter might have lapped us since we incremented it - auto current_free_idx = m_free_indices[free_idx_idx]; - size_type free_idx = IndexInUse; - free_idx = - Kokkos::atomic_compare_exchange(&m_free_indices[free_idx_idx], current_free_idx, free_idx); - Kokkos::memory_fence(); - - // TODO figure out how to decrement here? - - if(free_idx == IndexInUse) { - return nullptr; - } - else { - return (void*)&m_first_block[free_idx]; - } - } - - KOKKOS_INLINE_FUNCTION - void deallocate(void* ptr, size_type alloc_size) const noexcept - { - // figure out which block we are - auto offset = intptr_t(ptr) - intptr_t(m_first_block); - - KOKKOS_EXPECTS(offset % actual_size == 0 && offset/actual_size < m_num_blocks); - - Kokkos::memory_fence(); - auto last_idx_idx = Kokkos::atomic_fetch_add((volatile size_type*)&m_last_free_idx, size_type(1)); - last_idx_idx %= m_num_blocks; - m_free_indices[last_idx_idx] = offset / actual_size; - } - -}; -#endif - -} // end namespace Impl -} // end namespace Kokkos - -#endif // KOKKOS_IMPL_KOKKOS_FIXEDBUFFERMEMORYPOOL_HPP diff --git a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp index e844a5295e..29a365e6e4 100644 --- a/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_FunctorAnalysis.hpp @@ -118,8 +118,8 @@ struct FunctorAnalysis { using functor_has_space = has_execution_space; static_assert(!policy_has_space::value || !functor_has_space::value || - std::is_same::value, + std::is_same_v, "Execution Policy and Functor execution space must match"); //---------------------------------------- @@ -136,9 +136,8 @@ struct FunctorAnalysis { typename std::is_void::type> { using type = typename F::value_type; - static_assert(!std::is_reference::value && - std::rank::value <= 1 && - std::extent::value == 0, + static_assert(!std::is_reference_v && std::rank_v <= 1 && + std::extent_v == 0, "Kokkos Functor::value_type is T or T[]"); }; @@ -149,7 +148,7 @@ struct FunctorAnalysis { template ::type, - bool T = std::is_void::value> + bool T = std::is_void_v> struct deduce_value_type { using type = V; }; @@ -290,8 +289,8 @@ struct FunctorAnalysis { using candidate_type = typename deduce_value_type::type; enum { - candidate_is_void = std::is_void::value, - candidate_is_array = std::rank::value == 1 + candidate_is_void = std::is_void_v, + candidate_is_array = std::rank_v == 1 }; //---------------------------------------- @@ -306,7 +305,7 @@ struct FunctorAnalysis { using value_type = std::remove_extent_t; - static_assert(!std::is_const::value, + static_assert(!std::is_const_v, "Kokkos functor operator reduce argument cannot be const"); private: @@ -614,21 +613,20 @@ struct FunctorAnalysis { }; template - struct DeduceJoinNoTag::value || - (!is_reducer::value && - std::is_void::value)) && - detected_join_no_tag::value>> + struct DeduceJoinNoTag< + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + detected_join_no_tag::value>> : public has_join_no_tag_function { enum : bool { value = true }; }; template struct DeduceJoinNoTag< - F, - std::enable_if_t<(is_reducer::value || - (!is_reducer::value && std::is_void::value)) && - (!detected_join_no_tag::value && - detected_volatile_join_no_tag::value)>> + F, std::enable_if_t<(is_reducer::value || + (!is_reducer::value && std::is_void_v)) && + (!detected_join_no_tag::value && + detected_volatile_join_no_tag::value)>> : public has_volatile_join_no_tag_function { enum : bool { value = true }; static_assert(Impl::dependent_false_v, @@ -735,8 +733,8 @@ struct FunctorAnalysis { template struct DeduceInitNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_init_no_tag_function::enable_if( &F::init))>> : public has_init_no_tag_function { @@ -835,8 +833,8 @@ struct FunctorAnalysis { template struct DeduceFinalNoTag< - F, std::enable_if_t::value || (!is_reducer::value && - std::is_void::value), + F, std::enable_if_t::value || + (!is_reducer::value && std::is_void_v), decltype(has_final_no_tag_function::enable_if( &F::final))>> : public has_final_no_tag_function { @@ -906,14 +904,14 @@ struct FunctorAnalysis { Functor m_functor; template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return m_functor.value_count; } template - KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() const - noexcept { + KOKKOS_INLINE_FUNCTION constexpr std::enable_if_t len() + const noexcept { return candidate_is_void ? 0 : 1; } @@ -973,8 +971,8 @@ struct FunctorAnalysis { DeduceJoin<>::join(&m_functor, dst, src); } - KOKKOS_INLINE_FUNCTION reference_type init(ValueType* const dst) const - noexcept { + KOKKOS_INLINE_FUNCTION reference_type + init(ValueType* const dst) const noexcept { DeduceInit<>::init(&m_functor, dst); return reference(dst); } @@ -987,11 +985,11 @@ struct FunctorAnalysis { KOKKOS_INLINE_FUNCTION const Functor& get_functor() const { return m_functor; } - Reducer(Reducer const&) = default; - Reducer(Reducer&&) = default; + Reducer(Reducer const&) = default; + Reducer(Reducer&&) = default; Reducer& operator=(Reducer const&) = delete; - Reducer& operator=(Reducer&&) = delete; - ~Reducer() = default; + Reducer& operator=(Reducer&&) = delete; + ~Reducer() = default; KOKKOS_INLINE_FUNCTION explicit constexpr Reducer( Functor const& arg_functor) noexcept diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp b/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp index 56f95c814d..6d3ebf64be 100644 --- a/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_GraphImpl.hpp @@ -56,7 +56,7 @@ struct GraphAccess { static_assert( Kokkos::Impl::is_specialization_of::value, "Kokkos Internal Error in graph interface"); - return std::make_shared((Args &&) args...); + return std::make_shared((Args&&)args...); } template ::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_node_ptr()`)"); - return ((NodeRef &&) node_ref).get_node_ptr(); + return ((NodeRef&&)node_ref).get_node_ptr(); } template @@ -93,7 +93,7 @@ struct GraphAccess { Kokkos::Experimental::GraphNodeRef>::value, "Kokkos Internal Implementation error (bad argument to " "`GraphAccess::get_graph_weak_ptr()`)"); - return ((NodeRef &&) node_ref).get_graph_weak_ptr(); + return ((NodeRef&&)node_ref).get_graph_weak_ptr(); } // end accessors for private members of public interface }}}2 diff --git a/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp b/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp index 2ab05cb8e4..b02a265472 100644 --- a/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp +++ b/lib/kokkos/core/src/impl/Kokkos_GraphImpl_Utilities.hpp @@ -54,9 +54,9 @@ template